Crawler config

<?xml version="1.0" encoding="UTF-8"?>
<configuration>
  <agent>
    <email>yourmail@mail.com</email>
    <web_address>www.youraddress.com</web_address>
  </agent>
  <classifier>
    <min_content_terms>
      <value>2</value>
      <description>Minimum number of terms that must exist in the clean content of each web page in order for the page to be stored</description>
    </min_content_terms>
    <min_unique_content_terms>
      <value>2</value>
      <description>Minimum number of unique terms that must exist in the clean content of each web page in order for the page to be stored</description>
    </min_unique_content_terms>
    <max_depth>
      <value>10</value>
      <description>Maximum depth to crawl before abandoning a specific path. Depth is increased every time a link is extracted from a non-relevant web page.</description>
    </max_depth>
  </classifier>
  <fetcher>
    <fetch_buffer_size>
      <value>512</value>
      <description>Max number of URLs to fetch per run</description>
    </fetch_buffer_size>
    <socket_timeout>
      <value>10000</value>
      <description>Socket timeout in milliseconds (per URL)</description>
    </socket_timeout>
    <connection_timeout>
      <value>10000</value>
      <description>Connection timeout in milliseconds (per URL)</description>
    </connection_timeout>
    <max_retry_count>
      <value>2</value>
      <description>Max number of attempts to fetch a web page before giving up</description>
    </max_retry_count>
    <min_response_rate>
      <value>0</value>
      <description>Min bytes per second for fetching a web page</description>
    </min_response_rate>
    <valid_mime_types>
      <mime_type value="text/html" />
      <mime_type value="text/plain" />
      <mime_type value="application/xhtml+xml" />
      <description>Accepted MIME types</description>
    </valid_mime_types>
    <crawl_delay>
      <value>1500</value>
      <description>Delay in milliseconds between requests</description>
    </crawl_delay>
    <max_content_size>
      <value>531072</value>
      <description>Max content size (bytes) when downloading a web page</description>
    </max_content_size>
    <max_requests_per_run>
      <value>512</value>
      <description>Max fetch set size per run (sets are made of URLs from the same host)</description>
    </max_requests_per_run>
    <max_requests_per_host_per_run>
      <value>512</value>
      <description>Max URLs from a specific host per run</description>
    </max_requests_per_host_per_run>
    <max_connections_per_host>
      <value>32</value>
      <description>Max number of fetching threads per host</description>
    </max_connections_per_host>
    <max_fetched_per_host>
      <value>500000</value>
      <description>Max web pages to fetch per host</description>
    </max_fetched_per_host>
    <max_redirects>
      <value>5</value>
      <description>Max number of redirects to follow</description>
    </max_redirects>
    <request_timeout>
      <value>600000</value>
      <description>Max time in milliseconds to wait for the Fetcher to get all URLs in a run</description>
    </request_timeout>
  </fetcher>
</configuration>
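
The file above is a complete XML document, so any standard XML parser can read it. The following is a minimal sketch in Python (not part of the crawler itself) showing how the settings could be loaded into a dictionary; the file name crawler_config.xml and the function name load_config are assumptions made for illustration.

import xml.etree.ElementTree as ET

def load_config(path="crawler_config.xml"):
    """Collect the settings under <agent>, <classifier> and <fetcher>."""
    root = ET.parse(path).getroot()          # <configuration>
    config = {}
    for section in root:                     # <agent>, <classifier>, <fetcher>
        settings = {}
        for setting in section:
            # Most settings wrap their value in a <value> child; the
            # <agent> entries carry the value as element text instead.
            value = setting.findtext("value")
            if value is None:
                value = (setting.text or "").strip() or None
            if value is not None:
                settings[setting.tag] = value
        config[section.tag] = settings
    return config

if __name__ == "__main__":
    cfg = load_config()
    print(cfg["fetcher"]["crawl_delay"])     # "1500"
    # <mime_type> entries keep their value in an attribute,
    # so they are read separately:
    root = ET.parse("crawler_config.xml").getroot()
    mimes = [m.get("value")
             for m in root.findall("fetcher/valid_mime_types/mime_type")]
    print(mimes)  # ['text/html', 'text/plain', 'application/xhtml+xml']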
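
As a further hedged illustration of how the <fetcher> settings interact, the sketch below applies the socket timeout, retry count, content-size cap and crawl delay when downloading a batch of URLs. The function fetch_batch is hypothetical (it reuses load_config from the previous sketch) and is not the crawler's actual fetch loop.

import time
import urllib.request

def fetch_batch(urls, cfg):
    """Fetch URLs while honouring a few of the <fetcher> settings."""
    timeout = int(cfg["fetcher"]["socket_timeout"]) / 1000   # ms -> s
    delay = int(cfg["fetcher"]["crawl_delay"]) / 1000        # ms -> s
    attempts = int(cfg["fetcher"]["max_retry_count"])
    max_size = int(cfg["fetcher"]["max_content_size"])
    pages = {}
    for url in urls:
        for attempt in range(attempts):
            try:
                with urllib.request.urlopen(url, timeout=timeout) as resp:
                    pages[url] = resp.read(max_size)   # cap the body size
                break
            except OSError:
                if attempt == attempts - 1:
                    pages[url] = None                  # give up on this URL
        time.sleep(delay)  # politeness delay between successive requests
    return pages

A real fetcher would additionally partition URLs into per-host sets (max_requests_per_run, max_requests_per_host_per_run), bound concurrency with max_connections_per_host, and stop a host once max_fetched_per_host pages have been downloaded; those aspects are omitted here for brevity.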