package gr.ilsp.fmc.main;

import bixo.config.FetcherPolicy;
import bixo.config.UserAgent;
import bixo.datum.UrlStatus;
import bixo.urls.SimpleUrlNormalizer;
import bixo.utils.CrawlDirUtils;
import cascading.flow.PlannerException;
import cascading.tap.Tap;
import cascading.tuple.TupleEntryCollector;
import gr.ilsp.fmc.datums.CrawlDbDatum;
import gr.ilsp.fmc.mysql.MYSQLTapFactory;
import gr.ilsp.fmc.mysql.MYSQLTools;
import gr.ilsp.fmc.parser.DomainUrlFilter;
import gr.ilsp.fmc.utils.TopicTools;
import gr.ilsp.fmc.workflows.SimpleCrawlWorkflow;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URL;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.HashSet;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.apache.tools.mail.MailMessage;
import org.hsqldb.DatabaseURL;

/* loaded from: input_file:gr/ilsp/fmc/main/SimpleCrawl.class */
/**
 * Command-line entry point that seeds a JDBC-backed crawl database and then runs a number of
 * crawl loops, each of which is a {@link SimpleCrawlWorkflow} flow writing into its own loop
 * directory below the configured output directory.
 *
 * <p>Crawler settings are read from the classpath resource {@code crawler_config.xml} and are
 * exposed through the static {@link #config} field.
 */
public class SimpleCrawl {
    private static final Logger LOGGER = Logger.getLogger(SimpleCrawl.class);

    /** Classpath resource holding the crawler settings. */
    private static final String CONFIG_RESOURCE = "crawler_config.xml";

    /** Name under which the per-loop file appender is registered on the root logger. */
    private static final String LOOP_APPENDER_NAME = "loop-logger";

    /** Exit status used for start-up failures (same value the database-initialisation failure uses). */
    private static final int EXIT_STARTUP_FAILURE = 64;

    /** Exit status used when the crawl itself fails. */
    private static final int EXIT_CRAWL_FAILURE = -1;

    private static ArrayList<String[]> topic;
    private static String[] classes;

    /** Crawler configuration loaded by {@link #main(String[])}. */
    public static XMLConfiguration config;

    /**
     * Points the root logger's per-loop file appender at the log file of the given loop,
     * creating and registering the appender on first use.
     *
     * @param loopDir    directory (as a string) in which the loop's log file is placed
     * @param loopNumber number of the loop; becomes the prefix of the log file name
     */
    private static void setLoopLoggerFile(String loopDir, int loopNumber) {
        Logger rootLogger = Logger.getRootLogger();
        String logFile = String.format("%s/%d-JDBCCrawlTool.log", loopDir, Integer.valueOf(loopNumber));

        FileAppender appender = (FileAppender) rootLogger.getAppender(LOOP_APPENDER_NAME);
        boolean isNew = appender == null;
        if (isNew) {
            appender = new FileAppender();
            appender.setName(LOOP_APPENDER_NAME);
            appender.setLayout(new PatternLayout("%d{yy/MM/dd HH:mm:ss} %p %c{2}:%L - %m%n"));
        }
        appender.setFile(logFile);
        appender.activateOptions();
        if (isNew) {
            rootLogger.addAppender(appender);
        }
    }

    /**
     * Writes a single unfetched seed entry for the given domain into the URL sink.
     *
     * @param domain  host name to seed; it is prefixed with {@code DatabaseURL.S_HTTP} and normalized
     * @param tap     sink the seed tuple is written to
     * @param jobConf job configuration used to open the sink
     * @throws IOException if the sink cannot be opened or written
     */
    private static void importOneDomain(String domain, Tap tap, JobConf jobConf) throws IOException {
        TupleEntryCollector writer = tap.openForWrite(jobConf);
        try {
            // NOTE(review): DatabaseURL.S_HTTP looks like a decompiler-resolved constant,
            // presumably "http://" - confirm and replace with a local literal.
            String seedUrl = new SimpleUrlNormalizer().normalize(DatabaseURL.S_HTTP + domain);
            writer.add(new CrawlDbDatum(seedUrl, 0L, 0L, UrlStatus.UNFETCHED, 0, 0.0d).getTuple());
        } finally {
            // Close even when normalisation or the write fails, so the sink is not leaked.
            writer.close();
        }
    }

    /**
     * Reads seed URLs (one per line, UTF-8) from a file and writes each one as an unfetched
     * entry into the URL sink. Blank lines are ignored.
     *
     * @param urlFile path of the seed URL list
     * @param tap     sink the seed tuples are written to
     * @param jobConf job configuration used to open the sink
     * @throws IOException if the file cannot be read or the sink cannot be opened or written
     */
    private static void importUrlList(String urlFile, Tap tap, JobConf jobConf) throws IOException {
        TupleEntryCollector writer = tap.openForWrite(jobConf);
        try {
            BufferedReader reader =
                    new BufferedReader(new InputStreamReader(new FileInputStream(urlFile), "UTF-8"));
            try {
                SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
                String line;
                while ((line = reader.readLine()) != null) {
                    if (line.trim().length() == 0) {
                        // An empty line is not a URL; do not seed the crawl with it.
                        continue;
                    }
                    writer.add(new CrawlDbDatum(normalizer.normalize(line), 0L, 0L,
                            UrlStatus.UNFETCHED, 0, 0.0d).getTuple());
                }
            } finally {
                reader.close();
            }
        } finally {
            // Reached on success and on failure alike, after the reader has been closed.
            writer.close();
        }
    }

    /**
     * Runs the crawl tool: parses the options, loads the configuration, initialises the
     * database, seeds the crawl on a fresh output directory and executes the requested number
     * of crawl loops.
     *
     * <p>The process exits with status 64 on start-up failures (configuration, invalid domain,
     * database initialisation) and with status -1 when the crawl itself fails.
     *
     * @param strArr command-line arguments, handed to {@link SimpleCrawlOptions#parseOptions}
     */
    public static void main(String[] strArr) {
        SimpleCrawlOptions options = new SimpleCrawlOptions();
        options.parseOptions(strArr);

        URL configUrl = SimpleCrawl.class.getClassLoader().getResource(CONFIG_RESOURCE);
        if (configUrl == null) {
            LOGGER.error("Configuration resource not found on the classpath: " + CONFIG_RESOURCE);
            System.exit(EXIT_STARTUP_FAILURE);
        }
        config = new XMLConfiguration();
        config.setURL(configUrl);
        try {
            config.load();
        } catch (ConfigurationException e) {
            // Nothing below can work without the configuration, so fail here with the real
            // cause instead of failing later on the first missing key.
            LOGGER.error("Unable to load configuration from " + configUrl, e);
            System.exit(EXIT_STARTUP_FAILURE);
        }
        int minUniqueContentTerms = config.getInt("classifier.min_unique_content_terms.value");
        int maxDepth = config.getInt("classifier.max_depth.value");

        String domain = options.getDomain();
        int tokensNumber = options.getTokensNumber();
        String urlListFile = null;
        if (domain == null) {
            urlListFile = options.getUrls();
        } else if (!domain.equals(MailMessage.DEFAULT_HOST) && domain.split("\\.").length < 2) {
            // NOTE(review): MailMessage.DEFAULT_HOST looks like a decompiler-resolved constant,
            // presumably "localhost" - confirm and replace with a local literal.
            LOGGER.error("The target domain should be a valid paid-level domain or subdomain of the same: " + domain);
            options.help();
            // help() may already terminate the JVM; exit explicitly so an invalid domain is
            // never crawled if it does not.
            System.exit(EXIT_STARTUP_FAILURE);
        }

        boolean dbInitialized = false;
        try {
            dbInitialized = MYSQLTools.initializeDB(options.getDbName(), options.getDbHost());
        } catch (SQLException e) {
            LOGGER.error("Error while initializing database " + options.getDbName(), e);
            System.exit(EXIT_STARTUP_FAILURE);
        }

        topic = TopicTools.analyzeTopic(options.getTopic(), options.getLanguage(), null);
        classes = TopicTools.findSubclasses(topic);
        double absoluteThreshold = TopicTools.calculateThreshold(topic, 5);
        // Read from this tool's own configuration: another entry point's static config is not
        // initialised when the crawl is started through this class.
        double relativeThreshold = config.getDouble("classifier.relative_relevance_threshold.value");

        String outputDir = options.getOutputDir();
        System.setProperty("fmc.root.level", options.isDebug() ? "DEBUG" : "INFO");
        if (options.getLoggingAppender() != null) {
            System.setProperty("fmc.appender", options.getLoggingAppender());
        }

        try {
            JobConf jobConf = new JobConf();
            jobConf.setJarByClass(SimpleCrawl.class);
            Path outputPath = new Path(outputDir);
            FileSystem fileSystem = outputPath.getFileSystem(jobConf);

            // Leftover output is removed when no DB host is given or when initializeDB
            // returned true.
            if ((options.getDbHost() == null || dbInitialized) && fileSystem.exists(outputPath)) {
                System.out.println("Warning: Previous cycle output dirs exist in : " + outputDir);
                System.out.println("Warning: Delete the output dir before running");
                fileSystem.delete(outputPath, true);
            }

            if (!fileSystem.exists(outputPath)) {
                // Fresh crawl: create loop 0 and seed the URL table.
                fileSystem.mkdirs(outputPath);
                Path seedLoopDir = CrawlDirUtils.makeLoopDir(fileSystem, outputPath, 0);
                setLoopLoggerFile(seedLoopDir.toUri().toString(), 0);
                Tap urlSink = MYSQLTapFactory.createUrlsSinkJDBCTap(options.getDbHost(), options.getDbName());
                if (domain != null) {
                    importOneDomain(domain, urlSink, jobConf);
                } else {
                    importUrlList(urlListFile, urlSink, jobConf);
                }
            }

            Path latestLoopDir = CrawlDirUtils.findLatestLoopDir(fileSystem, outputPath);
            if (latestLoopDir == null) {
                System.err.println("No previous cycle output dirs exist in " + outputDir);
                options.help();
                // Without a loop directory there is nothing to continue from.
                System.exit(EXIT_CRAWL_FAILURE);
            }

            int startLoop = CrawlDirUtils.extractLoopNumber(latestLoopDir);
            int endLoop = startLoop + options.getNumLoops();

            UserAgent userAgent = new UserAgent(options.getAgentName(),
                    config.getString("agent.email"), config.getString("agent.web_address"));

            FetcherPolicy fetcherPolicy = new FetcherPolicy();
            fetcherPolicy.setCrawlDelay(config.getLong("fetcher.crawl_delay.value"));
            fetcherPolicy.setMaxContentSize(config.getInt("fetcher.max_content_size.value"));
            fetcherPolicy.setFetcherMode(FetcherPolicy.FetcherMode.EFFICIENT);
            fetcherPolicy.setMaxRequestsPerConnection(config.getInt("fetcher.max_requests_per_run.value"));
            fetcherPolicy.setMaxConnectionsPerHost(config.getInt("fetcher.max_connections_per_host.value"));

            HashSet<String> validMimeTypes = new HashSet<String>();
            for (String mimeType : config.getStringArray("fetcher.valid_mime_types.mime_type[@value]")) {
                validMimeTypes.add(mimeType);
            }
            fetcherPolicy.setValidMimeTypes(validMimeTypes);

            int crawlDuration = options.getCrawlDuration();
            boolean hasDeadline = crawlDuration != SimpleCrawlOptions.NO_CRAWL_DURATION;
            // Multiply as long: minutes * 60000 overflows int for durations above ~24 days.
            long deadlineMillis = hasDeadline
                    ? System.currentTimeMillis() + (crawlDuration * 60000L)
                    : Long.MAX_VALUE;

            DomainUrlFilter urlFilter = new DomainUrlFilter(domain);
            for (int loop = startLoop + 1; loop <= endLoop; loop++) {
                if (hasDeadline) {
                    // Give this loop an equal share of the time that is still left.
                    int loopsRemaining = (endLoop - loop) + 1;
                    long now = System.currentTimeMillis();
                    fetcherPolicy.setCrawlEndTime(now + ((deadlineMillis - now) / loopsRemaining));
                }
                Path loopDir = CrawlDirUtils.makeLoopDir(fileSystem, outputPath, loop);
                setLoopLoggerFile(loopDir.toUri().toString(), loop);
                SimpleCrawlWorkflow.createFlow(latestLoopDir, loopDir, userAgent, fetcherPolicy, urlFilter,
                        options.getThreads(), options.isDebug(), options.getDbHost(), options.getDbName(),
                        options.getLanguage(), options.getLangKeys(), classes, topic, absoluteThreshold,
                        relativeThreshold, minUniqueContentTerms, maxDepth, tokensNumber).complete();
                // The next loop reads what this one produced.
                latestLoopDir = loopDir;
            }
        } catch (PlannerException e) {
            e.writeDOT("build/failed-flow.dot");
            System.err.println("PlannerException: " + e.getMessage());
            e.printStackTrace(System.err);
            System.exit(EXIT_CRAWL_FAILURE);
        } catch (Throwable t) {
            System.err.println("Exception running tool: " + t.getMessage());
            t.printStackTrace(System.err);
            System.exit(EXIT_CRAWL_FAILURE);
        }
        MYSQLTapFactory.shutdown();
    }
}
