Project

General

Profile

FBC config » History » Version 5

Prokopis Prokopidis, 2016-02-16 06:44 PM

1 4 Prokopis Prokopidis
```xml
2 1 Vassilis Papavassiliou
<?xml version="1.0" encoding="UTF-8"?>
3 1 Vassilis Papavassiliou
<configuration>
4 1 Vassilis Papavassiliou
	<agent>
5 1 Vassilis Papavassiliou
		<email>yourmail@mail.com</email>
6 1 Vassilis Papavassiliou
		<web_address>www.youraddress.com</web_address>
7 1 Vassilis Papavassiliou
	</agent>
8 5 Prokopis Prokopidis
	<resourceCreator>
9 5 Prokopis Prokopidis
		<organization>ILSP</organization>
10 5 Prokopis Prokopidis
		<organizationURL>http://www.ilsp.gr</organizationURL>
11 5 Prokopis Prokopidis
	</resourceCreator>
12 5 Prokopis Prokopidis
	<fundingProject>
13 5 Prokopis Prokopidis
		<projectId>ELRC</projectId>
14 5 Prokopis Prokopidis
		<projectURL>http://lr-coordination.eu/</projectURL>
15 5 Prokopis Prokopidis
	</fundingProject>
16 1 Vassilis Papavassiliou
	<classifier>
17 1 Vassilis Papavassiliou
		<min_content_terms>
18 1 Vassilis Papavassiliou
			<value>2</value>
19 1 Vassilis Papavassiliou
			<description>Minimum number of terms that must exist in clean
20 3 Vassilis Papavassiliou
				content of each web page in order to be stored. This number
21 3 Vassilis Papavassiliou
				is multiplied with the median value of the terms' weights and
22 3 Vassilis Papavassiliou
				the result is the threshold for the absolute relevance score.</description>
23 1 Vassilis Papavassiliou
		</min_content_terms>
24 1 Vassilis Papavassiliou
		<min_unique_content_terms>
25 1 Vassilis Papavassiliou
			<value>2</value>
26 1 Vassilis Papavassiliou
			<description>Minimum unique terms that must exist in clean content</description>
27 3 Vassilis Papavassiliou
		</min_unique_content_terms>
28 3 Vassilis Papavassiliou
		<relative_relevance_threshold>
29 3 Vassilis Papavassiliou
			<value>0.2</value>
30 3 Vassilis Papavassiliou
			<description>The absolute relevance score is divided by the length
31 3 Vassilis Papavassiliou
			(in terms of tokens) of the clean content of a document and the
32 3 Vassilis Papavassiliou
			calculated relative relevance score is compared with this value</description>
33 1 Vassilis Papavassiliou
		</relative_relevance_threshold>
34 1 Vassilis Papavassiliou
		<max_depth>
35 5 Prokopis Prokopidis
			<value>20</value>
36 1 Vassilis Papavassiliou
			<description>Maximum depth to crawl before abandoning a specific path. Depth
37 1 Vassilis Papavassiliou
			is increased every time a link is extracted from a non-relevant web page.</description>
38 3 Vassilis Papavassiliou
		</max_depth>
39 3 Vassilis Papavassiliou
	</classifier>
40 1 Vassilis Papavassiliou
	<aligner>
41 3 Vassilis Papavassiliou
		<win_align_path>
42 5 Prokopis Prokopidis
			<value>hunalign-1.2-windows64/hunalign.exe</value>
43 3 Vassilis Papavassiliou
			<description>relative path to executable of hunalign for windows.
44 1 Vassilis Papavassiliou
			The main hunalign directory is supposed to be next to the crawler's jar</description>
45 3 Vassilis Papavassiliou
		</win_align_path>
46 3 Vassilis Papavassiliou
		<lin_align_path>
47 5 Prokopis Prokopidis
			<value>hunalign/src/hunalign/hunalign</value>
48 1 Vassilis Papavassiliou
			<description>relative path to executable of hunalign for linux.
49 3 Vassilis Papavassiliou
			The main hunalign directory is supposed to be next to the crawler's jar</description>
50 3 Vassilis Papavassiliou
		</lin_align_path>
51 3 Vassilis Papavassiliou
		<align_dict>
52 5 Prokopis Prokopidis
			<value>hunalign/data</value>
53 3 Vassilis Papavassiliou
			<description>relative path to the dictionaries of hunalign.
54 1 Vassilis Papavassiliou
			The main hunalign directory is supposed to be next to the crawler's jar</description>
55 1 Vassilis Papavassiliou
		</align_dict>
56 1 Vassilis Papavassiliou
	</aligner>
57 1 Vassilis Papavassiliou
	<fetcher>
58 1 Vassilis Papavassiliou
		<fetch_buffer_size>
59 1 Vassilis Papavassiliou
			<description>Max number of urls to fetch per run</description>
60 5 Prokopis Prokopidis
			<value>1024</value>
61 1 Vassilis Papavassiliou
		</fetch_buffer_size>
62 1 Vassilis Papavassiliou
		<socket_timeout>
63 5 Prokopis Prokopidis
			<value>10000</value>
64 1 Vassilis Papavassiliou
			<description>Socket timeout in milliseconds (per URL)</description>
65 1 Vassilis Papavassiliou
		</socket_timeout>
66 1 Vassilis Papavassiliou
		<connection_timeout>
67 5 Prokopis Prokopidis
			<value>10000</value>
68 1 Vassilis Papavassiliou
			<description>Connection timeout in milliseconds (per URL)</description>
69 1 Vassilis Papavassiliou
		</connection_timeout>
70 1 Vassilis Papavassiliou
		<max_retry_count>
71 1 Vassilis Papavassiliou
			<value>2</value>
72 1 Vassilis Papavassiliou
			<description>Max number of attempts to fetch a Web page before giving up</description>
73 1 Vassilis Papavassiliou
		</max_retry_count>
74 1 Vassilis Papavassiliou
		<min_response_rate>
75 1 Vassilis Papavassiliou
			<value>0</value>
76 1 Vassilis Papavassiliou
			<description>Min bytes per second for fetching a web page</description>
77 1 Vassilis Papavassiliou
		</min_response_rate>
78 1 Vassilis Papavassiliou
		<valid_mime_types>
79 1 Vassilis Papavassiliou
			<mime_type value="text/html" />
80 1 Vassilis Papavassiliou
			<mime_type value="text/plain" />
81 1 Vassilis Papavassiliou
			<mime_type value="application/xhtml+xml" />
82 5 Prokopis Prokopidis
			<mime_type value="application/pdf" />
83 5 Prokopis Prokopidis
			<mime_type value="application/x-pdf" /> 
84 1 Vassilis Papavassiliou
			<description>Accepted mime types</description>
85 3 Vassilis Papavassiliou
		</valid_mime_types>
86 1 Vassilis Papavassiliou
		<crawl_delay>
87 1 Vassilis Papavassiliou
			<value>1000</value>
88 1 Vassilis Papavassiliou
			<description>delay in milliseconds between requests</description>
89 1 Vassilis Papavassiliou
		</crawl_delay>
90 1 Vassilis Papavassiliou
		<max_content_size>
91 1 Vassilis Papavassiliou
			<value>531072</value>
92 1 Vassilis Papavassiliou
			<description>Max content size (bytes) for downloading a web page</description>
93 1 Vassilis Papavassiliou
		</max_content_size>
94 1 Vassilis Papavassiliou
		<max_requests_per_run>
95 5 Prokopis Prokopidis
			<value>1024</value>
96 1 Vassilis Papavassiliou
			<description>Max fetch set size per run (Sets are made by URLs from the same host)</description>
97 1 Vassilis Papavassiliou
		</max_requests_per_run>
98 1 Vassilis Papavassiliou
		<max_requests_per_host_per_run>
99 5 Prokopis Prokopidis
			<value>1024</value>
100 1 Vassilis Papavassiliou
			<description>Max URLs from a specific host per run</description>
101 1 Vassilis Papavassiliou
		</max_requests_per_host_per_run>
102 1 Vassilis Papavassiliou
		<max_connections_per_host>
103 3 Vassilis Papavassiliou
			<value>100</value>
104 1 Vassilis Papavassiliou
			<description>Max number of fetching threads for each host</description>
105 1 Vassilis Papavassiliou
		</max_connections_per_host>		
106 1 Vassilis Papavassiliou
		<max_fetched_per_host>
107 1 Vassilis Papavassiliou
			<value>10000000</value>
108 1 Vassilis Papavassiliou
			<description>Max web pages to fetch per host</description>
109 1 Vassilis Papavassiliou
		</max_fetched_per_host>
110 1 Vassilis Papavassiliou
		<max_redirects>
111 1 Vassilis Papavassiliou
			<value>5</value>
112 1 Vassilis Papavassiliou
			<description>Max number of redirects</description>
113 1 Vassilis Papavassiliou
		</max_redirects>
114 1 Vassilis Papavassiliou
		<request_timeout>
115 5 Prokopis Prokopidis
			<!--<value>6000000</value> -->
116 1 Vassilis Papavassiliou
			<value>600000</value>
117 1 Vassilis Papavassiliou
			<description>Max time to wait for Fetcher to get all URLs in a run</description>
118 1 Vassilis Papavassiliou
		</request_timeout>
119 1 Vassilis Papavassiliou
	</fetcher>	
120 1 Vassilis Papavassiliou
</configuration>
121 4 Prokopis Prokopidis
```