package gr.ilsp.fmc.main;

import bixo.datum.UrlStatus;
import bixo.urls.SimpleUrlNormalizer;
import cascading.scheme.Scheme;
import cascading.scheme.SequenceFile;
import cascading.tap.Hfs;
import cascading.tuple.TupleEntryCollector;
import gr.ilsp.fmc.datums.CrawlDbDatum;
import gr.ilsp.fmc.exporter.SampleExporter;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.security.CodeSource;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.configuration.CompositeConfiguration;
import org.apache.commons.configuration.ConfigurationException;
import org.apache.commons.configuration.XMLConfiguration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.FileAppender;
import org.apache.log4j.Logger;
import org.apache.log4j.PatternLayout;
import org.hsqldb.DatabaseURL;

/* loaded from: input_file:gr/ilsp/fmc/main/SimpleCrawlHFS.class */
public class SimpleCrawlHFS {
    // Topic and class data. Neither field is read or written by any method whose body is
    // visible in this file (presumably they are used by crawl(), which was not decompiled).
    private static ArrayList<String[]> topic;
    private static String[] classes;
    // Crawler configuration, publicly accessible; never assigned in the visible code.
    public static CompositeConfiguration config;
    // Names/extensions for result artifacts. renamePaths() uses the literal "xml" for the
    // same sub-directory; the other two are not referenced by the visible methods.
    private static final String resultXMLDir = "xml";
    private static final String resultTMXDir = "tmx";
    private static final String tempFileExt = ".xml.txt";
    private static final Logger LOGGER = Logger.getLogger(SimpleCrawlHFS.class);
    // Run-wide counters, updated through the public static increment* methods below.
    // They are plain ints, so the increments are not atomic.
    private static int PAGES_STORED = 0;
    private static int PAGES_FAILED_CLASSIFICATION = 0;
    private static int PAGES_VISITED = 0;
    private static int TOKENS_STORED = 0;
    // NOTE(review): presumably the token count at which crawl() stops ("target tokens amount
    // reached" in the recovered log text) -- confirm against the original crawl() code.
    private static int TOKENS_TARGET = 100000000;
    // Platform file separator, used by renamePaths() when building paths.
    private static String fs1 = System.getProperty("file.separator");
    private static String lang_separator = ";";
    // Hadoop job configuration, publicly accessible; null until something assigns it
    // (no assignment is visible in this file).
    public static JobConf conf = null;
    // Requested operation; main() overwrites this default with the lower-cased first argument.
    private static String operation = "crawlandexport";
    // Shared matcher the seed-file importers use (via reset(line).matches()) to decide which
    // lines to skip. The pattern has three alternatives: whitespace only, the empty string
    // (a consequence of the doubled '|'), or a line whose first character is '#'.
    // A Matcher is stateful, so this single instance must not be used by two threads at once.
    protected static Matcher skipLineM = Pattern.compile("^(\\s*)||(#.*)$").matcher("");

    /**
     * Points the root logger's "loop-logger" file appender at the log file for one crawl loop.
     *
     * <p>The target file is {@code <str>/<i>-JDBCCrawlTool.log}. If the root logger already has
     * an appender named "loop-logger" it is re-targeted; otherwise a new one is created with
     * the pattern layout below and attached to the root logger.
     *
     * @param str directory that receives the log file
     * @param i   loop number used as the file-name prefix
     */
    private static void setLoopLoggerFile(String str, int i) {
        final String appenderName = "loop-logger";
        final Logger root = Logger.getRootLogger();
        final String logFile = String.format("%s/%d-JDBCCrawlTool.log", str, i);

        FileAppender appender = (FileAppender) root.getAppender(appenderName);
        final boolean created = (appender == null);
        if (created) {
            // First loop: build the appender before it can be pointed at a file.
            appender = new FileAppender();
            appender.setName(appenderName);
            appender.setLayout(new PatternLayout("%d{yy/MM/dd HH:mm:ss} %p %c{2}:%L - %m%n"));
        }

        // Common to both paths: switch the file and (re)open it.
        appender.setFile(logFile);
        appender.activateOptions();

        if (created) {
            root.addAppender(appender);
        }
    }

    /**
     * Writes a single seed entry for one domain into the crawl database at {@code path}.
     *
     * <p>The URL is built by prefixing {@code str} with {@code DatabaseURL.S_HTTP} and running
     * the result through {@link SimpleUrlNormalizer}; it is stored as an {@code UNFETCHED}
     * {@link CrawlDbDatum} with all numeric fields zeroed.
     *
     * <p>The collector is closed in a {@code finally} block so it is released even when
     * normalising or adding the tuple fails.
     *
     * @param str     domain name without a scheme
     * @param path    location of the crawl database sequence file
     * @param jobConf Hadoop job configuration used to open the tap
     * @throws IOException if the tap cannot be opened for writing
     */
    private static void importOneDomain(String str, Path path, JobConf jobConf) throws IOException {
        // NOTE(review): the trailing 'true' is presumably Hfs's "replace existing" flag -- confirm.
        TupleEntryCollector writer = new Hfs((Scheme) new SequenceFile(CrawlDbDatum.FIELDS), path.toUri().toString(), true).openForWrite(jobConf);
        final CrawlDbDatum seed;
        try {
            seed = new CrawlDbDatum(new SimpleUrlNormalizer().normalize(DatabaseURL.S_HTTP + str), 0L, 0L, UrlStatus.UNFETCHED, 0, 0.0d);
            writer.add(seed.getTuple());
        } finally {
            writer.close();
        }
        // Logged only after the collector was closed successfully, as before.
        LOGGER.info("Added domain: " + seed.getUrl());
    }

    /**
     * Seeds the crawl database at {@code path} from a UTF-8 text file of URLs, one per line.
     *
     * <p>Per line:
     * <ul>
     *   <li>if the trimmed line starts with a byte-order mark (U+FEFF), the line is replaced by
     *       its trimmed form without the mark;</li>
     *   <li>lines matched by {@code skipLineM} (blank lines and lines starting with '#') are
     *       skipped; because the mark is removed first, this also covers a BOM-prefixed
     *       comment or blank line at the top of the file;</li>
     *   <li>lines already seen are ignored;</li>
     *   <li>remaining lines are normalised and stored as {@code UNFETCHED} entries, unless they
     *       are empty, start with "ftp", or equal {@code DatabaseURL.S_HTTP}.</li>
     * </ul>
     *
     * <p>The logged count is the number of distinct non-skipped lines, which includes lines
     * rejected by the last rule.
     *
     * <p>Reader and collector are closed in {@code finally} blocks so neither leaks when a
     * read, normalisation or write fails.
     *
     * @param str     path of the seed file
     * @param path    location of the crawl database sequence file
     * @param jobConf Hadoop job configuration used to open the tap
     * @throws IOException if the tap cannot be opened or the seed file cannot be read
     */
    private static void importUrlList(String str, Path path, JobConf jobConf) throws IOException {
        TupleEntryCollector writer = new Hfs((Scheme) new SequenceFile(CrawlDbDatum.FIELDS), path.toUri().toString(), true).openForWrite(jobConf);
        try {
            SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(str), "utf8"));
            try {
                // A hash set keeps duplicate detection O(1) per line.
                Set<String> seen = new HashSet<String>();
                String line;
                while ((line = reader.readLine()) != null) {
                    // The reader already decoded the bytes, so a BOM shows up as the single
                    // character U+FEFF; dropping that character avoids any byte/charset
                    // round trip (the old code rebuilt the string in the platform charset).
                    String trimmed = line.trim();
                    if (trimmed.startsWith("\uFEFF")) {
                        line = trimmed.substring(1);
                    }
                    if (skipLineM.reset(line).matches()) {
                        continue;
                    }
                    if (!seen.add(line)) {
                        continue;
                    }
                    if (!line.equals("") && !line.startsWith("ftp") && !line.equals(DatabaseURL.S_HTTP)) {
                        writer.add(new CrawlDbDatum(normalizer.normalize(line), 0L, 0L, UrlStatus.UNFETCHED, 0, 0.0d).getTuple());
                    }
                }
                LOGGER.info("Starting from " + seen.size() + " URLs");
            } finally {
                reader.close();
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Seeds the crawl database at {@code path} from a UTF-8 text file of URLs, one per line.
     *
     * <p>Per line:
     * <ul>
     *   <li>a leading byte-order mark (U+FEFF) is removed; the line is otherwise left
     *       untrimmed;</li>
     *   <li>lines matched by {@code skipLineM} (blank lines and lines starting with '#') are
     *       skipped; because the mark is removed first, this also covers a BOM-prefixed
     *       comment or blank line at the top of the file;</li>
     *   <li>lines already seen are ignored;</li>
     *   <li>remaining lines are normalised and stored as {@code UNFETCHED} entries, unless they
     *       are empty, start with "ftp", or equal {@code DatabaseURL.S_HTTP}.</li>
     * </ul>
     *
     * <p>The logged count is the number of distinct non-skipped lines, which includes lines
     * rejected by the last rule.
     *
     * <p>Reader and collector are closed in {@code finally} blocks so neither leaks when a
     * read, normalisation or write fails.
     *
     * <p>NOTE(review): despite the name, nothing in this method restricts the URLs to a single
     * domain -- confirm whether the caller relies on any such filtering.
     *
     * @param str     path of the seed file
     * @param path    location of the crawl database sequence file
     * @param jobConf Hadoop job configuration used to open the tap
     * @throws IOException if the tap cannot be opened or the seed file cannot be read
     */
    private static void importURLOneDomain(String str, Path path, JobConf jobConf) throws IOException {
        TupleEntryCollector writer = new Hfs((Scheme) new SequenceFile(CrawlDbDatum.FIELDS), path.toUri().toString(), true).openForWrite(jobConf);
        try {
            SimpleUrlNormalizer normalizer = new SimpleUrlNormalizer();
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(str), "utf8"));
            try {
                // A hash set keeps duplicate detection O(1) per line.
                Set<String> seen = new HashSet<String>();
                String line;
                while ((line = reader.readLine()) != null) {
                    // The reader already decoded the bytes, so a BOM shows up as the single
                    // character U+FEFF; dropping that character avoids any byte/charset
                    // round trip (the old code rebuilt the string in the platform charset).
                    if (line.startsWith("\uFEFF")) {
                        line = line.substring(1);
                    }
                    if (skipLineM.reset(line).matches()) {
                        continue;
                    }
                    if (!seen.add(line)) {
                        continue;
                    }
                    if (!line.equals("") && !line.startsWith("ftp") && !line.equals(DatabaseURL.S_HTTP)) {
                        writer.add(new CrawlDbDatum(normalizer.normalize(line), 0L, 0L, UrlStatus.UNFETCHED, 0, 0.0d).getTuple());
                    }
                }
                LOGGER.info("Starting from " + seen.size() + " URLs");
            } finally {
                reader.close();
            }
        } finally {
            writer.close();
        }
    }

    /**
     * Command-line entry point. The first argument selects the operation (case-insensitive):
     * <ul>
     *   <li>{@code export} -- delegates to {@link SampleExporter#main(String[])};</li>
     *   <li>{@code config <file>} -- saves the bundled {@code crawler_config.xml} to
     *       {@code <file>};</li>
     *   <li>{@code crawl} / {@code crawlandexport} -- runs {@code crawl(strArr)}.</li>
     * </ul>
     * With no arguments, or with an unknown operation, the JVM exits with status -1.
     *
     * <p>The operation name is lower-cased with {@link Locale#ROOT} so that matching does not
     * depend on the default locale (e.g. "CONFIG" under a Turkish locale). Failures are logged
     * together with the exception so the stack trace is not lost.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        if (strArr.length == 0) {
            LOGGER.info("Usage: SimpleCrawlHFS [crawl|export|config]");
            System.exit(-1);
        }
        if (helpAsked(strArr)) {
            // NOTE(review): a help flag triggers crawl(), presumably so that its own option
            // parser prints the usage text and exits. If crawl() returns instead, execution
            // falls through to the normal dispatch below -- confirm this is intended.
            try {
                crawl(strArr);
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
            }
        }
        operation = strArr[0].toLowerCase(Locale.ROOT);
        if (operation.equals("export")) {
            SampleExporter.main(strArr);
            return;
        }
        if (operation.equals("config")) {
            if (strArr.length <= 1) {
                LOGGER.error("Usage: SimpleCrawlHFS config <file to save config xml>");
                return;
            }
            String target = strArr[1];
            try {
                new XMLConfiguration(SimpleCrawlHFS.class.getClassLoader().getResource("crawler_config.xml")).save(target);
                LOGGER.info("Saved default config file at " + target);
            } catch (ConfigurationException e) {
                LOGGER.error("Couldn't save file " + target, e);
            }
            return;
        }
        if (operation.equals("crawl") || operation.equals("crawlandexport")) {
            try {
                crawl(strArr);
            } catch (IOException e) {
                LOGGER.error(e.getMessage(), e);
            }
        } else {
            LOGGER.error("Invalid operation.");
            System.exit(-1);
        }
    }

    /**
     * Tells whether any command-line argument is one of the recognised help flags:
     * {@code -h}, {@code -help}, {@code --help} or {@code --h} (exact, case-sensitive match).
     *
     * @param strArr command-line arguments
     * @return {@code true} if at least one argument is a help flag, {@code false} otherwise
     */
    private static boolean helpAsked(String[] strArr) {
        for (String arg : strArr) {
            boolean isHelpFlag = arg.equals("-h")
                    || arg.equals("-help")
                    || arg.equals("--help")
                    || arg.equals("--h");
            if (isHelpFlag) {
                return true;
            }
        }
        return false;
    }

    /* JADX WARN: Code restructure failed: missing block: B:105:0x06ed, code lost:
    
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Time expired or target tokens amount reached, ending crawl.");
        r0 = java.lang.System.currentTimeMillis() - r0;
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Made " + ((r51 - r0) - 1) + " runs in " + (java.lang.System.currentTimeMillis() - r0) + " milliseconds.");
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Total pages stored/visited: " + gr.ilsp.fmc.main.SimpleCrawlHFS.PAGES_STORED + "/" + gr.ilsp.fmc.main.SimpleCrawlHFS.PAGES_VISITED);
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Total pages failed classification or are too short: " + gr.ilsp.fmc.main.SimpleCrawlHFS.PAGES_FAILED_CLASSIFICATION);
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Total tokens stored: " + gr.ilsp.fmc.main.SimpleCrawlHFS.TOKENS_STORED);
        gr.ilsp.fmc.main.SimpleCrawlHFS.LOGGER.info("Average run time: " + (((float) r0) / ((r51 - r0) - 1)) + " milliseconds.");
     */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Decompiler placeholder for the crawl routine invoked by {@code main}.
     *
     * <p>The original body (5206 bytecode instructions) could not be reconstructed by JADX, so
     * this stub unconditionally throws {@link UnsupportedOperationException}; the real logic
     * has to be recovered from the original class file before this method can be used.
     * The fragment preserved in the comment above shows that the original ended a crawl by
     * logging run counts, timings and the page/token counters of this class.
     *
     * @param r15 command-line arguments, as passed through from {@code main}
     * @throws java.io.IOException declared by the original signature; never thrown by this stub
     * @throws UnsupportedOperationException always, in this decompiled form
     */
    private static void crawl(java.lang.String[] r15) throws java.io.IOException {
        /*
            Method dump skipped, instructions count: 5206
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: gr.ilsp.fmc.main.SimpleCrawlHFS.crawl(java.lang.String[]):void");
    }

    /**
     * Returns the directory containing the jar (or class directory) this class was loaded
     * from, i.e. the parent of the code-source location.
     *
     * <p>Literal '+' characters in the location path are protected before URL-decoding,
     * because {@link URLDecoder} implements form decoding and would otherwise turn each '+'
     * into a space, producing a path that does not exist.
     *
     * @return the parent directory of the code source; an empty string when the code source
     *         or its location is unavailable; {@code null} when the location has no parent
     *         (as reported by {@link File#getParent()})
     */
    public static String getRunningJarPath() {
        CodeSource codeSource = SimpleCrawlHFS.class.getProtectionDomain().getCodeSource();
        if (codeSource == null || codeSource.getLocation() == null) {
            // No location is known for this class; report "unknown" the same way the
            // method always signalled failure instead of failing with an NPE.
            return "";
        }
        String rawPath = codeSource.getLocation().getPath();
        try {
            // "%2B" decodes back to '+', so plus signs survive the decode step.
            String decoded = URLDecoder.decode(rawPath.replace("+", "%2B"), "UTF-8");
            return new File(decoded).getParent();
        } catch (UnsupportedEncodingException e) {
            // Cannot happen: every Java platform is required to support UTF-8.
            throw new IllegalStateException("UTF-8 charset is not available", e);
        }
    }

    /**
     * Rewrites file-system paths inside the crawl's output files so that they use forward
     * slashes, no longer carry the {@code DatabaseURL.S_FILE} prefix and, when {@code str7}
     * is given, point below {@code str7} instead of below the grandparent directory of
     * {@code str2}.
     *
     * <p>Files processed: {@code str3} (always), {@code str4}/{@code str5}/{@code str6} (when
     * non-null), and every file in {@code <str2>/xml} whose name ends with "xml" and contains
     * "_". For the latter, each {@code trans.loc="} attribute is first prefixed with the
     * absolute path of that xml directory.
     *
     * <p>Nothing is done when {@code str2} has no grandparent directory or does not contain
     * it. A missing or unreadable xml directory is skipped rather than causing a
     * {@code NullPointerException}. As before, an {@code IOException} aborts the remaining
     * work and is reported on standard error.
     *
     * @param str  unused
     * @param str2 output directory; its grandparent is the path prefix to be replaced
     * @param str3 first file to rewrite
     * @param str4 optional file to rewrite, may be {@code null}
     * @param str5 optional file to rewrite, may be {@code null}
     * @param str6 optional file to rewrite, may be {@code null}
     * @param str7 replacement for the grandparent prefix (trimmed before use), or
     *             {@code null} to leave the prefix unchanged
     */
    private static void renamePaths(String str, String str2, String str3, String str4, String str5, String str6, String str7) {
        String parent = new File(str2).getParent();
        String grandParent = (parent == null) ? null : new File(parent).getParent();
        if (grandParent == null) {
            // No grandparent directory means there is no prefix that could be replaced.
            return;
        }
        String oldPrefix = grandParent.replace("\\", "/");
        if (!str2.replace("\\", "/").contains(oldPrefix)) {
            return;
        }
        String newPrefix = (str7 == null) ? null : str7.trim();
        try {
            rewritePathsInFile(str3, oldPrefix, newPrefix);
            for (String optionalFile : new String[] {str4, str5, str6}) {
                if (optionalFile != null) {
                    rewritePathsInFile(optionalFile, oldPrefix, newPrefix);
                }
            }

            File xmlDir = new File(str2 + fs1 + "xml");
            File[] entries = xmlDir.listFiles();
            if (entries == null) {
                // listFiles() returns null when the directory is missing or unreadable.
                return;
            }
            String absoluteTransLoc = "trans.loc=\"" + xmlDir.getAbsolutePath() + fs1;
            for (File entry : entries) {
                String name = entry.getName();
                if (name.endsWith("xml") && name.contains("_")) {
                    String content = ReadResources.readFileAsString(entry.getAbsolutePath()).replace("trans.loc=\"", absoluteTransLoc);
                    ReadResources.writetextfile(entry.getAbsolutePath(), normalizePaths(content, oldPrefix, newPrefix));
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /** Reads {@code file}, normalises the paths in its content and writes it back in place. */
    private static void rewritePathsInFile(String file, String oldPrefix, String newPrefix) throws IOException {
        ReadResources.writetextfile(file, normalizePaths(ReadResources.readFileAsString(file), oldPrefix, newPrefix));
    }

    /**
     * Converts backslashes to slashes, swaps {@code oldPrefix} for {@code newPrefix} (when the
     * latter is non-null) and removes every occurrence of {@code DatabaseURL.S_FILE}.
     */
    private static String normalizePaths(String content, String oldPrefix, String newPrefix) {
        String result = content.replace("\\", "/");
        if (newPrefix != null) {
            result = result.replace(oldPrefix, newPrefix);
        }
        // The second backslash pass covers backslashes brought in by newPrefix.
        return result.replace(DatabaseURL.S_FILE, "").replace("\\", "/");
    }

    /**
     * Checks whether the history consists of exactly one entry whose second element
     * (index 1) is zero.
     *
     * @param arrayList history entries; each array must have at least two elements
     * @return {@code true} only for a single-entry list whose entry has {@code [1] == 0}
     */
    private static boolean check_evol1(ArrayList<int[]> arrayList) {
        return arrayList.size() == 1 && arrayList.get(0)[1] == 0;
    }

    /**
     * Checks whether the two most recent history entries have the same second element
     * (index 1), i.e. whether that value did not change between the last two entries.
     *
     * @param arrayList history entries; each array must have at least two elements
     * @return {@code true} if the list has more than one entry and the last two entries agree
     *         at index 1; {@code false} otherwise
     */
    private static boolean check_evol(ArrayList<int[]> arrayList) {
        final int count = arrayList.size();
        if (count <= 1) {
            return false;
        }
        final int latest = arrayList.get(count - 1)[1];
        final int previous = arrayList.get(count - 2)[1];
        return latest == previous;
    }

    /** Adds one to the run-wide count of stored pages. The update is not atomic. */
    public static void incrementPagesStored() {
        PAGES_STORED += 1;
    }

    /** Adds one to the run-wide count of visited pages. The update is not atomic. */
    public static void incrementPagesVisited() {
        PAGES_VISITED += 1;
    }

    /**
     * Adds {@code d} to the run-wide count of stored tokens and returns the new total.
     *
     * <p>The sum is computed in floating point and then narrowed to {@code int}, so any
     * fractional part of the new total is discarded.
     *
     * @param d number of tokens to add; must not be {@code null}
     * @return the updated token total
     */
    public static int incrementTokensStored(Double d) {
        final double updatedTotal = TOKENS_STORED + d.doubleValue();
        TOKENS_STORED = (int) updatedTotal;
        return TOKENS_STORED;
    }

    /**
     * Adds one to the run-wide count of pages that failed classification.
     * The update is not atomic.
     */
    public static void incrementPagesCutByClassifier() {
        PAGES_FAILED_CLASSIFICATION += 1;
    }
}
