Project

General

Profile

Crawler config » History » Version 3

Prokopis Prokopidis, 2016-02-16 10:09 AM

1 2 Prokopis Prokopidis
```xml
2 2 Prokopis Prokopidis
<?xml version="1.0" encoding="UTF-8"?>
3 1 Prokopis Prokopidis
<configuration>
4 1 Prokopis Prokopidis
	<agent>
5 1 Prokopis Prokopidis
		<email>yourmail@mail.com</email>
6 1 Prokopis Prokopidis
		<web_address>www.youraddress.com</web_address>
7 1 Prokopis Prokopidis
	</agent>
8 3 Prokopis Prokopidis
	<resourceCreator>
9 3 Prokopis Prokopidis
		<organization>ILSP</organization>
10 3 Prokopis Prokopidis
		<organizationURL>http://www.ilsp.gr</organizationURL>
11 3 Prokopis Prokopidis
	</resourceCreator>
12 3 Prokopis Prokopidis
	<fundingProject>
13 3 Prokopis Prokopidis
		<projectId>Project</projectId>
14 3 Prokopis Prokopidis
		<projectURL>http://www.example.org</projectURL>
15 3 Prokopis Prokopidis
	</fundingProject>
16 3 Prokopis Prokopidis
17 1 Prokopis Prokopidis
	<classifier>
18 1 Prokopis Prokopidis
		<min_content_terms>
19 1 Prokopis Prokopidis
			<value>2</value>
20 1 Prokopis Prokopidis
			<description>Minimum number of terms that must exist in clean
21 1 Prokopis Prokopidis
				content of each web page in order to be stored.</description>
22 1 Prokopis Prokopidis
		</min_content_terms>
23 1 Prokopis Prokopidis
		<min_unique_content_terms>
24 1 Prokopis Prokopidis
			<value>2</value>
25 1 Prokopis Prokopidis
			<description>Minimum number of unique terms that must exist in clean content</description>
26 1 Prokopis Prokopidis
		</min_unique_content_terms>
27 1 Prokopis Prokopidis
		<max_depth>
28 1 Prokopis Prokopidis
			<value>10</value>
29 1 Prokopis Prokopidis
			<description>Maximum depth to crawl before abandoning a specific path. Depth
30 1 Prokopis Prokopidis
			is increased every time a link is extracted from a non-relevant web page.</description>
31 1 Prokopis Prokopidis
		</max_depth>
32 1 Prokopis Prokopidis
	</classifier>
33 1 Prokopis Prokopidis
	<fetcher>
34 1 Prokopis Prokopidis
		<fetch_buffer_size>
35 1 Prokopis Prokopidis
			<description>Max number of urls to fetch per run</description>
36 1 Prokopis Prokopidis
			<value>512</value>
37 1 Prokopis Prokopidis
		</fetch_buffer_size>
38 1 Prokopis Prokopidis
		<socket_timeout>
39 1 Prokopis Prokopidis
			<value>10000</value>
40 1 Prokopis Prokopidis
			<description>Socket timeout in milliseconds (per URL)</description>
41 1 Prokopis Prokopidis
		</socket_timeout>
42 1 Prokopis Prokopidis
		<connection_timeout>
43 1 Prokopis Prokopidis
			<value>10000</value>
44 1 Prokopis Prokopidis
			<description>Connection timeout in milliseconds (per URL)</description>
45 1 Prokopis Prokopidis
		</connection_timeout>
46 1 Prokopis Prokopidis
		<max_retry_count>
47 1 Prokopis Prokopidis
			<value>2</value>
48 1 Prokopis Prokopidis
			<description>Max number of attempts to fetch a Web page before giving up</description>
49 1 Prokopis Prokopidis
		</max_retry_count>
50 1 Prokopis Prokopidis
		<min_response_rate>
51 1 Prokopis Prokopidis
			<value>0</value>
52 1 Prokopis Prokopidis
			<description>Min bytes-per-second for fetching a web page</description>
53 1 Prokopis Prokopidis
		</min_response_rate>
54 1 Prokopis Prokopidis
		<valid_mime_types>
55 1 Prokopis Prokopidis
			<mime_type value="text/html" />
56 1 Prokopis Prokopidis
			<mime_type value="text/plain" />
57 1 Prokopis Prokopidis
			<mime_type value="application/xhtml+xml" />
58 1 Prokopis Prokopidis
			<description>Accepted mime types</description>
59 1 Prokopis Prokopidis
		</valid_mime_types>
60 1 Prokopis Prokopidis
		<crawl_delay>
61 1 Prokopis Prokopidis
			<value>1500</value>
62 1 Prokopis Prokopidis
			<description>Delay in milliseconds between requests</description>
63 1 Prokopis Prokopidis
		</crawl_delay>
64 1 Prokopis Prokopidis
		<max_content_size>
65 1 Prokopis Prokopidis
			<value>531072</value>
66 1 Prokopis Prokopidis
			<description>Max content size (bytes) for downloading a web page</description>
67 1 Prokopis Prokopidis
		</max_content_size>
68 1 Prokopis Prokopidis
		<max_requests_per_run>
69 1 Prokopis Prokopidis
			<value>512</value>
70 1 Prokopis Prokopidis
			<description>Max fetch set size per run (sets are made of URLs from the same host)</description>
71 1 Prokopis Prokopidis
		</max_requests_per_run>
72 1 Prokopis Prokopidis
		<max_requests_per_host_per_run>
73 1 Prokopis Prokopidis
			<value>512</value>
74 1 Prokopis Prokopidis
			<description>Max URLs from a specific host per run</description>
75 1 Prokopis Prokopidis
		</max_requests_per_host_per_run>
76 1 Prokopis Prokopidis
		<max_connections_per_host>
77 1 Prokopis Prokopidis
			<value>32</value>
78 1 Prokopis Prokopidis
			<description>Max number of fetching threads for each host</description>
79 1 Prokopis Prokopidis
		</max_connections_per_host>		
80 1 Prokopis Prokopidis
		<max_fetched_per_host>
81 1 Prokopis Prokopidis
			<value>500000</value>
82 1 Prokopis Prokopidis
			<description>Max web pages to fetch per host</description>
83 1 Prokopis Prokopidis
		</max_fetched_per_host>
84 1 Prokopis Prokopidis
		<max_redirects>
85 1 Prokopis Prokopidis
			<value>5</value>
86 1 Prokopis Prokopidis
			<description>Max number of redirects</description>
87 1 Prokopis Prokopidis
		</max_redirects>
88 1 Prokopis Prokopidis
		<request_timeout>
89 1 Prokopis Prokopidis
			<value>600000</value>
90 1 Prokopis Prokopidis
			<description>Max time to wait for Fetcher to get all URLs in a run</description>
91 1 Prokopis Prokopidis
		</request_timeout>
92 1 Prokopis Prokopidis
	</fetcher>	
93 1 Prokopis Prokopidis
</configuration>
94 2 Prokopis Prokopidis
```