public class CrawlConfig extends Object
| Constructor and Description |
|---|
CrawlConfig() |
| Modifier and Type | Method and Description |
|---|---|
void |
addAuthInfo(AuthInfo authInfo) |
List<AuthInfo> |
getAuthInfos() |
int |
getConnectionTimeout() |
String |
getCrawlStorageFolder() |
Collection<org.apache.http.message.BasicHeader> |
getDefaultHeaders()
Return a copy of the default header collection.
|
int |
getMaxConnectionsPerHost() |
int |
getMaxDepthOfCrawling() |
int |
getMaxDownloadSize() |
int |
getMaxOutgoingLinksToFollow() |
int |
getMaxPagesToFetch() |
int |
getMaxTotalConnections() |
int |
getPolitenessDelay() |
String |
getProxyHost() |
String |
getProxyPassword() |
int |
getProxyPort() |
String |
getProxyUsername() |
int |
getSocketTimeout() |
String |
getUserAgentString() |
boolean |
isFollowRedirects() |
boolean |
isIncludeBinaryContentInCrawling() |
boolean |
isIncludeHttpsPages() |
boolean |
isOnlineTldListUpdate() |
boolean |
isProcessBinaryContentInCrawling() |
boolean |
isResumableCrawling() |
boolean |
isShutdownOnEmptyQueue() |
void |
setAuthInfos(List<AuthInfo> authInfos) |
void |
setConnectionTimeout(int connectionTimeout) |
void |
setCrawlStorageFolder(String crawlStorageFolder)
The folder which will be used by crawler for storing the intermediate
crawl data.
|
void |
setDefaultHeaders(Collection<? extends org.apache.http.Header> defaultHeaders)
Set the default header collection (creating copies of the provided headers).
|
void |
setFollowRedirects(boolean followRedirects) |
void |
setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling) |
void |
setIncludeHttpsPages(boolean includeHttpsPages) |
void |
setMaxConnectionsPerHost(int maxConnectionsPerHost) |
void |
setMaxDepthOfCrawling(int maxDepthOfCrawling)
Maximum depth of crawling. For unlimited depth this parameter should be set to -1.
|
void |
setMaxDownloadSize(int maxDownloadSize) |
void |
setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow) |
void |
setMaxPagesToFetch(int maxPagesToFetch)
Maximum number of pages to fetch. For an unlimited number of pages, this parameter should be set to -1.
|
void |
setMaxTotalConnections(int maxTotalConnections) |
void |
setOnlineTldListUpdate(boolean online)
Should the TLD list be updated automatically on each run? Alternatively,
it can be loaded from the embedded tld-names.txt resource file that was
obtained from https://publicsuffix.org/list/effective_tld_names.dat
|
void |
setPolitenessDelay(int politenessDelay)
Politeness delay in milliseconds (delay between sending two requests to
the same host).
|
void |
setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling)
Should we process binary content such as images, audio, ...
|
void |
setProxyHost(String proxyHost) |
void |
setProxyPassword(String proxyPassword)
If crawler should run behind a proxy and user/pass is needed for
authentication in proxy, this parameter can be used for specifying the password.
|
void |
setProxyPort(int proxyPort) |
void |
setProxyUsername(String proxyUsername) |
void |
setResumableCrawling(boolean resumableCrawling)
If this feature is enabled, you would be able to resume a previously
stopped/crashed crawl.
|
void |
setShutdownOnEmptyQueue(boolean shutdown)
Should the crawler stop running when the queue is empty?
|
void |
setSocketTimeout(int socketTimeout) |
void |
setUserAgentString(String userAgentString)
user-agent string that is used for representing your crawler to web
servers.
|
String |
toString() |
void |
validate()
Validates the configs specified by this instance.
|
public void validate()
throws Exception
Exception - on validation failure
public String getCrawlStorageFolder()
public void setCrawlStorageFolder(String crawlStorageFolder)
crawlStorageFolder - The folder for the crawler's storage
public boolean isResumableCrawling()
public void setResumableCrawling(boolean resumableCrawling)
resumableCrawling - Should crawling be resumable between runs?
public int getMaxDepthOfCrawling()
public void setMaxDepthOfCrawling(int maxDepthOfCrawling)
maxDepthOfCrawling - Depth of crawling (all links on current page = depth of 1)
public int getMaxPagesToFetch()
public void setMaxPagesToFetch(int maxPagesToFetch)
maxPagesToFetch - How many pages to fetch from all threads together?
public String getUserAgentString()
public void setUserAgentString(String userAgentString)
userAgentString - Custom user-agent string to use as your crawler's identifier
public Collection&lt;org.apache.http.message.BasicHeader&gt; getDefaultHeaders()
public void setDefaultHeaders(Collection<? extends org.apache.http.Header> defaultHeaders)
public int getPolitenessDelay()
public void setPolitenessDelay(int politenessDelay)
politenessDelay - the delay in milliseconds.
public boolean isIncludeHttpsPages()
public void setIncludeHttpsPages(boolean includeHttpsPages)
includeHttpsPages - Should we crawl https pages?
public boolean isIncludeBinaryContentInCrawling()
public void setIncludeBinaryContentInCrawling(boolean includeBinaryContentInCrawling)
includeBinaryContentInCrawling - Should we fetch binary content such as images, audio, ...?
public boolean isProcessBinaryContentInCrawling()
public void setProcessBinaryContentInCrawling(boolean processBinaryContentInCrawling)
public int getMaxConnectionsPerHost()
public void setMaxConnectionsPerHost(int maxConnectionsPerHost)
maxConnectionsPerHost - Maximum connections per host
public int getMaxTotalConnections()
public void setMaxTotalConnections(int maxTotalConnections)
maxTotalConnections - Maximum total connections
public int getSocketTimeout()
public void setSocketTimeout(int socketTimeout)
socketTimeout - Socket timeout in milliseconds
public int getConnectionTimeout()
public void setConnectionTimeout(int connectionTimeout)
connectionTimeout - Connection timeout in milliseconds
public int getMaxOutgoingLinksToFollow()
public void setMaxOutgoingLinksToFollow(int maxOutgoingLinksToFollow)
maxOutgoingLinksToFollow - Max number of outgoing links which are processed from a page
public int getMaxDownloadSize()
public void setMaxDownloadSize(int maxDownloadSize)
maxDownloadSize - Max allowed size of a page. Pages larger than this size will not be fetched.
public boolean isFollowRedirects()
public void setFollowRedirects(boolean followRedirects)
followRedirects - Should we follow redirects?
public boolean isShutdownOnEmptyQueue()
public void setShutdownOnEmptyQueue(boolean shutdown)
public boolean isOnlineTldListUpdate()
public void setOnlineTldListUpdate(boolean online)
public String getProxyHost()
public void setProxyHost(String proxyHost)
proxyHost - If crawler should run behind a proxy, this parameter can be used for specifying the proxy host.
public int getProxyPort()
public void setProxyPort(int proxyPort)
proxyPort - If crawler should run behind a proxy, this parameter can be used for specifying the proxy port.
public String getProxyUsername()
public void setProxyUsername(String proxyUsername)
proxyUsername - If crawler should run behind a proxy and user/pass is needed for
authentication in proxy, this parameter can be used for specifying the username.
public String getProxyPassword()
public void setProxyPassword(String proxyPassword)
proxyPassword - If crawler should run behind a proxy and user/pass is needed for authentication in proxy, this parameter can be used for specifying the password.
public void addAuthInfo(AuthInfo authInfo)
public void setAuthInfos(List<AuthInfo> authInfos)
authInfos - authentication information to set
Copyright © 2015. All rights reserved.