public class Spider extends Object implements Runnable, Task
Downloader,
Scheduler,
PageProcessor,
Pipeline

| 限定符和类型 | 类和说明 |
|---|---|
| static class | Spider.Status |
| 限定符和类型 | 字段和说明 |
|---|---|
| protected boolean | destroyWhenExit |
| protected Downloader | downloader |
| protected ExecutorService | executorService |
| protected boolean | exitWhenComplete |
| protected org.slf4j.Logger | logger |
| protected PageProcessor | pageProcessor |
| protected List<Pipeline> | pipelines |
| protected Scheduler | scheduler |
| protected Site | site |
| protected boolean | spawnUrl |
| protected List<Request> | startRequests |
| protected AtomicInteger | stat |
| protected static int | STAT_INIT |
| protected static int | STAT_RUNNING |
| protected static int | STAT_STOPPED |
| protected int | threadNum |
| protected CountableThreadPool | threadPool |
| protected String | uuid |
| 构造器和说明 |
|---|
Spider(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
| 限定符和类型 | 方法和说明 |
|---|---|
Spider |
addPipeline(Pipeline pipeline)
add a pipeline for Spider
|
Spider |
addRequest(Request... requests)
Add urls with information to crawl.
|
Spider |
addUrl(String... urls)
Add urls to crawl.
|
protected void |
checkIfRunning() |
Spider |
clearPipeline()
clear the pipelines set
|
void |
close() |
static Spider |
create(PageProcessor pageProcessor)
create a spider with pageProcessor.
|
Spider |
downloader(Downloader downloader)
已过时。
|
protected void |
extractAndAddRequests(Page page,
boolean spawnUrl) |
<T> T |
get(String url) |
<T> List<T> |
getAll(Collection<String> urls)
Download urls synchronously.
|
protected CollectorPipeline |
getCollectorPipeline() |
long |
getPageCount()
Get page count downloaded by spider.
|
Scheduler |
getScheduler() |
Site |
getSite()
site of a task
|
List<SpiderListener> |
getSpiderListeners() |
Date |
getStartTime() |
Spider.Status |
getStatus()
Get running status by spider.
|
int |
getThreadAlive()
Get thread count which is running
|
String |
getUUID()
unique id for a task.
|
protected void |
initComponent() |
boolean |
isExitWhenComplete() |
boolean |
isSpawnUrl() |
protected void |
onError(Request request) |
protected void |
onSuccess(Request request) |
Spider |
pipeline(Pipeline pipeline)
已过时。
|
protected void |
processRequest(Request request) |
void |
run() |
void |
runAsync() |
Spider |
scheduler(Scheduler scheduler)
set scheduler for Spider
|
Spider |
setDownloader(Downloader downloader)
set the downloader of spider
|
void |
setEmptySleepTime(int emptySleepTime)
Set wait time when no url is polled.
|
Spider |
setExecutorService(ExecutorService executorService) |
Spider |
setExitWhenComplete(boolean exitWhenComplete)
Exit when complete.
|
Spider |
setPipelines(List<Pipeline> pipelines)
set pipelines for Spider
|
Spider |
setScheduler(Scheduler scheduler)
set scheduler for Spider
|
Spider |
setSpawnUrl(boolean spawnUrl)
Whether to add extracted urls for download.
|
Spider |
setSpiderListeners(List<SpiderListener> spiderListeners) |
Spider |
setUUID(String uuid)
Set a uuid for the spider.
|
protected void |
sleep(int time) |
void |
start() |
Spider |
startRequest(List<Request> startRequests)
Set start requests of Spider.
|
Spider |
startUrls(List<String> startUrls)
Set startUrls of Spider.
|
void |
stop() |
void |
test(String... urls)
Process specific urls without url discovering.
|
Spider |
thread(ExecutorService executorService,
int threadNum)
start with more than one thread
|
Spider |
thread(int threadNum)
start with more than one thread
|
protected Downloader downloader
protected PageProcessor pageProcessor
protected Site site
protected String uuid
protected Scheduler scheduler
protected org.slf4j.Logger logger
protected CountableThreadPool threadPool
protected ExecutorService executorService
protected int threadNum
protected AtomicInteger stat
protected boolean exitWhenComplete
protected static final int STAT_INIT
protected static final int STAT_RUNNING
protected static final int STAT_STOPPED
protected boolean spawnUrl
protected boolean destroyWhenExit
public Spider(PageProcessor pageProcessor)
pageProcessor - pageProcessor
public static Spider create(PageProcessor pageProcessor)
pageProcessor - pageProcessor
See also: PageProcessor
public Spider startUrls(List<String> startUrls)
startUrls - startUrls
public Spider startRequest(List<Request> startRequests)
startRequests - startRequests
public Spider setUUID(String uuid)
uuid - uuid
public Spider scheduler(Scheduler scheduler)
scheduler - scheduler
See also: setScheduler(us.codecraft.webmagic.scheduler.Scheduler)
public Spider setScheduler(Scheduler scheduler)
scheduler - scheduler
See also: Scheduler
public Spider pipeline(Pipeline pipeline)
pipeline - pipeline
See also: addPipeline(us.codecraft.webmagic.pipeline.Pipeline)
public Spider addPipeline(Pipeline pipeline)
pipeline - pipeline
See also: Pipeline
public Spider setPipelines(List<Pipeline> pipelines)
pipelines - pipelines
See also: Pipeline
public Spider clearPipeline()
public Spider downloader(Downloader downloader)
downloader - downloader
See also: setDownloader(us.codecraft.webmagic.downloader.Downloader)
public Spider setDownloader(Downloader downloader)
downloader - downloader
See also: Downloader
protected void initComponent()
protected void onError(Request request)
protected void onSuccess(Request request)
public void close()
public void test(String... urls)
urls - urls to process
protected void processRequest(Request request)
protected void sleep(int time)
protected void extractAndAddRequests(Page page, boolean spawnUrl)
protected void checkIfRunning()
public void runAsync()
public <T> List<T> getAll(Collection<String> urls)
urls - urls
protected CollectorPipeline getCollectorPipeline()
public <T> T get(String url)
public Spider addRequest(Request... requests)
requests - requests
public void start()
public void stop()
public Spider thread(int threadNum)
threadNum - threadNum
public Spider thread(ExecutorService executorService, int threadNum)
executorService - executorService to run the spider
threadNum - threadNum
public boolean isExitWhenComplete()
public Spider setExitWhenComplete(boolean exitWhenComplete)
exitWhenComplete - exitWhenComplete
public boolean isSpawnUrl()
public long getPageCount()
public Spider.Status getStatus()
See also: Spider.Status
public int getThreadAlive()
public Spider setSpawnUrl(boolean spawnUrl)
spawnUrl - spawnUrl
public Spider setExecutorService(ExecutorService executorService)
public List<SpiderListener> getSpiderListeners()
public Spider setSpiderListeners(List<SpiderListener> spiderListeners)
public Date getStartTime()
public Scheduler getScheduler()
public void setEmptySleepTime(int emptySleepTime)
emptySleepTime - in milliseconds.

Copyright © 2016. All rights reserved.