Project

General

Profile

Crawler config » History » Version 2

« Previous - Version 2/4 (diff) - Next » - Current version
Prokopis Prokopidis, 2016-02-05 01:39 PM


<?xml version="1.0" encoding="UTF-8"?>
<configuration>
    <!-- Contact details for the crawler agent: an e-mail address and a web
         address. The values below look like placeholders and should be
         replaced with real contact information.
         NOTE(review): presumably used to identify the crawler to the sites
         it visits; confirm against the code that reads this file. -->
    <agent>
        <email>yourmail@mail.com</email>
        <web_address>www.youraddress.com</web_address>
    </agent>
    <!-- Classifier settings: term-count thresholds applied to the clean
         content of a web page, and the maximum crawl depth. Each setting
         pairs a <value> with a human-readable <description>. -->
    <classifier>
        <min_content_terms>
            <value>2</value>
            <description>Minimum number of terms that must exist in clean
                content of each web page in order to be stored.</description>
        </min_content_terms>
        <min_unique_content_terms>
            <value>2</value>
            <description>Minimum unique terms that must exist in clean content</description>
        </min_unique_content_terms>
        <max_depth>
            <value>10</value>
            <description>Maximum depth to crawl before abandoning a specific path. Depth
            is increased every time a link is extracted from a non-relevant web page.</description>
        </max_depth>
    </classifier>
    <!-- Fetcher settings: per-run and per-host request limits, socket and
         connection timeouts, retry and redirect limits, accepted MIME types,
         crawl delay and maximum download size. Each setting pairs a <value>
         (or, for valid_mime_types, a list of <mime_type> entries) with a
         human-readable <description>. Time values are in milliseconds and
         sizes in bytes, as stated in the individual descriptions. -->
    <fetcher>
        <fetch_buffer_size>
            <description>Max number of urls to fetch per run</description>
            <value>512</value>
        </fetch_buffer_size>
        <socket_timeout>
            <value>10000</value>
            <description>Socket timeout in milliseconds(per URL)</description>
        </socket_timeout>
        <connection_timeout>
            <value>10000</value>
            <description>Connection timeout in milliseconds(per URL)</description>
        </connection_timeout>
        <max_retry_count>
            <value>2</value>
            <description>Max number of attempts to fetch a Web page before giving up</description>
        </max_retry_count>
        <min_response_rate>
            <value>0</value>
            <description>Min bytes-per-seconds for fetching a web page</description>
        </min_response_rate>
        <valid_mime_types>
            <mime_type value="text/html" />
            <mime_type value="text/plain" />
            <mime_type value="application/xhtml+xml" />
            <description>Accepted mime types</description>
        </valid_mime_types>
        <crawl_delay>
            <value>1500</value>
            <description>delay in milliseconds between requests</description>
        </crawl_delay>
        <!-- NOTE(review): 531072 is neither 131072 (128 KiB) nor 524288
             (512 KiB); confirm the value is intended and not a typo. -->
        <max_content_size>
            <value>531072</value>
            <description>Max content size (bytes) for downloading a web page</description>
        </max_content_size>
        <max_requests_per_run>
            <value>512</value>
            <description>Max fetch set size per run (Sets are made by URLs from the same host)</description>
        </max_requests_per_run>
        <max_requests_per_host_per_run>
            <value>512</value>
            <description>Max URLs from a specific host per run</description>
        </max_requests_per_host_per_run>
        <max_connections_per_host>
            <value>32</value>
            <description>Max number of fetching threads for each host</description>
        </max_connections_per_host>
        <max_fetched_per_host>
            <value>500000</value>
            <description>Max web pages to fetch per host</description>
        </max_fetched_per_host>
        <max_redirects>
            <value>5</value>
            <!-- Uses <description>, the element name every other setting
                 in this file uses for its explanatory text. -->
            <description>Max number of redirects</description>
        </max_redirects>
        <request_timeout>
            <value>600000</value>
            <description>Max time to wait for Fetcher to get all URLs in a run</description>
        </request_timeout>
    </fetcher>
</configuration>