package bixo.operations;

import bixo.datum.GroupedUrlDatum;
import bixo.datum.ScoredUrlDatum;
import bixo.datum.UrlStatus;
import bixo.fetcher.BaseFetcher;
import bixo.hadoop.FetchCounters;
import bixo.robots.BaseRobotRules;
import bixo.robots.BaseRobotsParser;
import bixo.robots.RobotUtils;
import bixo.utils.DomainInfo;
import bixo.utils.DomainNames;
import bixo.utils.GroupingKey;
import cascading.tuple.TupleEntryCollector;
import com.bixolabs.cascading.LoggingFlowProcess;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Queue;
import org.apache.log4j.Logger;

/* loaded from: input_file:bixo/operations/ProcessRobotsTask.class */
/**
 * Task that decides, for all queued URLs of one protocol+domain, whether each URL may be
 * fetched, and emits a {@link ScoredUrlDatum} tuple per URL to the collector.
 *
 * <p>The decision is based on the domain having a valid host address, the scorer accepting the
 * domain, and the robot rules obtained for {@code <protocol+domain>/robots.txt}.
 *
 * <p>Every write to the collector is done while holding the collector's monitor.
 * NOTE(review): presumably the collector is shared by several concurrently running tasks —
 * confirm against the code that schedules this task.
 */
public class ProcessRobotsTask implements Runnable {
    private static final Logger LOGGER = Logger.getLogger(ProcessRobotsTask.class);

    private final String _protocolAndDomain;
    private final BaseScoreGenerator _scorer;
    private final Queue<GroupedUrlDatum> _urls;
    private final BaseFetcher _fetcher;
    private final TupleEntryCollector _collector;
    private final BaseRobotsParser _parser;
    private final LoggingFlowProcess _flowProcess;

    /**
     * Creates a task for one protocol+domain.
     *
     * @param str the protocol and domain whose URLs are processed
     * @param baseScoreGenerator decides whether the domain is "good" and scores accepted URLs
     * @param queue URLs belonging to the domain; it is drained by {@link #run()}
     * @param baseFetcher fetcher handed to {@code RobotUtils.getRobotRules}
     * @param baseRobotsParser parser handed to {@code RobotUtils.getRobotRules}
     * @param tupleEntryCollector receives one tuple per processed URL
     * @param loggingFlowProcess used to update the {@link FetchCounters} counters
     */
    public ProcessRobotsTask(String str, BaseScoreGenerator baseScoreGenerator, Queue<GroupedUrlDatum> queue, BaseFetcher baseFetcher, BaseRobotsParser baseRobotsParser, TupleEntryCollector tupleEntryCollector, LoggingFlowProcess loggingFlowProcess) {
        _protocolAndDomain = str;
        _scorer = baseScoreGenerator;
        _urls = queue;
        _fetcher = baseFetcher;
        _parser = baseRobotsParser;
        _collector = tupleEntryCollector;
        _flowProcess = loggingFlowProcess;
    }

    /**
     * Drains {@code queue}, emitting every URL with the given grouping key, status
     * {@link UrlStatus#UNFETCHED} and a score of 1.0. The payload of each queued datum is
     * copied to the emitted datum.
     *
     * @param queue URLs to drain; empty when this method returns normally
     * @param str grouping key assigned to every emitted URL
     * @param tupleEntryCollector receives the emitted tuples (locked for each add)
     */
    public static void emptyQueue(Queue<GroupedUrlDatum> queue, String str, TupleEntryCollector tupleEntryCollector) {
        GroupedUrlDatum datum;
        while ((datum = queue.poll()) != null) {
            ScoredUrlDatum scoreUrl = new ScoredUrlDatum(datum.getUrl(), str, UrlStatus.UNFETCHED, 1.0d);
            scoreUrl.setPayload(datum.getPayload());
            emit(tupleEntryCollector, scoreUrl);
        }
    }

    /**
     * Processes every queued URL of the domain.
     *
     * <ul>
     *   <li>No valid host address, an invalid URL/URI, or any other exception: the domain is
     *       counted as rejected and the remaining URLs are emitted with the matching grouping
     *       key.</li>
     *   <li>Domain not accepted by the scorer: all URLs are emitted with the skipped grouping
     *       key.</li>
     *   <li>Robot rules defer visits: all URLs are emitted as {@code SKIPPED_DEFERRED}.</li>
     *   <li>Otherwise each URL is emitted as {@code UNFETCHED} with a generated score when the
     *       robot rules allow it, or as {@code SKIPPED_BLOCKED} when they do not.</li>
     * </ul>
     *
     * <p>{@code DOMAINS_PROCESSING} is incremented on entry and decremented exactly once on
     * every exit path, including when an exception escapes.
     */
    @Override
    public void run() {
        _flowProcess.increment(FetchCounters.DOMAINS_PROCESSING, 1);

        try {
            DomainInfo domainInfo = new DomainInfo(_protocolAndDomain);
            if (!domainInfo.isValidHostAddress()) {
                throw new UnknownHostException(_protocolAndDomain);
            }

            if (LOGGER.isTraceEnabled()) {
                LOGGER.trace(String.format("Resolved %s to %s", _protocolAndDomain, domainInfo.getHostAddress()));
            }

            String domain = domainInfo.getDomain();
            String pld = DomainNames.getPLD(domain);
            if (!_scorer.isGoodDomain(domain, pld)) {
                _flowProcess.increment(FetchCounters.DOMAINS_SKIPPED, 1);
                _flowProcess.increment(FetchCounters.URLS_SKIPPED, _urls.size());
                LOGGER.debug("Skipping URLs from not-good domain: " + domain);
                emptyQueue(_urls, GroupingKey.SKIPPED_GROUPING_KEY, _collector);
                return;
            }

            BaseRobotRules robotRules = RobotUtils.getRobotRules(_fetcher, _parser, new URL(domainInfo.getProtocolAndDomain() + "/robots.txt"));

            // The grouping key is only needed (and only computed) when visits are not deferred.
            String groupingKey = null;
            boolean deferVisits = robotRules.isDeferVisits();
            if (deferVisits) {
                LOGGER.debug("Deferring visits to URLs from " + domainInfo.getDomain());
                _flowProcess.increment(FetchCounters.DOMAINS_DEFERRED, 1);
            } else {
                groupingKey = GroupingKey.makeGroupingKey(domainInfo.getHostAddress(), robotRules.getCrawlDelay());
                _flowProcess.increment(FetchCounters.DOMAINS_FINISHED, 1);
            }

            GroupedUrlDatum datum;
            while ((datum = _urls.poll()) != null) {
                String url = datum.getUrl();

                FetchCounters counter;
                ScoredUrlDatum scoreUrl;
                if (deferVisits) {
                    counter = FetchCounters.URLS_DEFERRED;
                    scoreUrl = new ScoredUrlDatum(url, GroupingKey.DEFERRED_GROUPING_KEY, UrlStatus.SKIPPED_DEFERRED, 0.0d);
                } else if (robotRules.isAllowed(url)) {
                    counter = FetchCounters.URLS_ACCEPTED;
                    scoreUrl = new ScoredUrlDatum(url, groupingKey, UrlStatus.UNFETCHED, _scorer.generateScore(domain, pld, datum));
                } else {
                    counter = FetchCounters.URLS_BLOCKED;
                    scoreUrl = new ScoredUrlDatum(url, GroupingKey.BLOCKED_GROUPING_KEY, UrlStatus.SKIPPED_BLOCKED, 0.0d);
                }

                scoreUrl.setPayload(datum.getPayload());
                _flowProcess.increment(counter, 1);
                emit(_collector, scoreUrl);
            }
        } catch (UnknownHostException e) {
            LOGGER.debug("Unknown host: " + _protocolAndDomain);
            rejectRemainingUrls(GroupingKey.UNKNOWN_HOST_GROUPING_KEY);
        } catch (MalformedURLException e) {
            LOGGER.debug("Invalid URL: " + _protocolAndDomain);
            rejectRemainingUrls(GroupingKey.INVALID_URL_GROUPING_KEY);
        } catch (URISyntaxException e) {
            LOGGER.debug("Invalid URI: " + _protocolAndDomain);
            rejectRemainingUrls(GroupingKey.INVALID_URL_GROUPING_KEY);
        } catch (Exception e) {
            LOGGER.warn("Exception processing " + _protocolAndDomain, e);
            rejectRemainingUrls(GroupingKey.INVALID_URL_GROUPING_KEY);
        } finally {
            // Single exit point for the in-progress counter. All handlers hang off one try, so
            // a failure inside a handler propagates instead of being re-caught by the generic
            // handler and counted as a second rejected domain.
            _flowProcess.decrement(FetchCounters.DOMAINS_PROCESSING, 1);
        }
    }

    /** Counts the domain and its still-queued URLs as rejected, then emits those URLs with {@code groupingKey}. */
    private void rejectRemainingUrls(String groupingKey) {
        _flowProcess.increment(FetchCounters.DOMAINS_REJECTED, 1);
        _flowProcess.increment(FetchCounters.URLS_REJECTED, _urls.size());
        emptyQueue(_urls, groupingKey, _collector);
    }

    /** Adds the datum's tuple to the collector while holding the collector's monitor. */
    private static void emit(TupleEntryCollector collector, ScoredUrlDatum scoreUrl) {
        synchronized (collector) {
            collector.add(scoreUrl.getTuple());
        }
    }
}
