package gr.ilsp.fmc.exporter.pdf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.jdom.JDOMException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:gr/ilsp/fmc/exporter/pdf/PDFCorpus.class */
/**
 * Command-line driver that exports a corpus of paired PDF documents.
 *
 * <p>A corpus description file lists one document pair per line as two
 * whitespace-separated file names. For every pair a {@link PDFPair} is
 * configured with the corpus-level metadata held by this object and then
 * exported. Blank lines and {@code #} comment lines in the description file
 * are ignored.
 *
 * <p>Instances are plain mutable holders; nothing here is synchronized.
 */
public class PDFCorpus {
    /**
     * Matches description-file lines that carry no document pair: empty or
     * whitespace-only lines, and {@code #} comments with optional leading
     * whitespace. Group 1 is the leading whitespace, group 2 the comment.
     *
     * <p>The matcher is shared mutable state and must be {@code reset} before
     * each use.
     */
    protected static Matcher skipRuleM = Pattern.compile("^(\\s*)(#.*)?$").matcher("");
    private static final Logger logger = LoggerFactory.getLogger(PDFCorpus.class);

    /** Character encoding used to read the corpus description file. */
    String inCharsetName = "UTF-8";
    public String topWebDomain;
    public String domain;
    public String genre;
    public String project;
    public String projectWebSite;
    public String linguality;
    public List<String> languages;
    public String crawlDate;
    public File corpusDir;
    public File corpusDescriptionFile;
    public File outputDir;

    /**
     * Runs an export with hard-coded corpus metadata.
     *
     * @param strArr {@code [0]} corpus directory, {@code [1]} corpus
     *               description file, {@code [2]} output directory
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws IOException              if closing the description file fails
     * @throws JDOMException            if exporting a pair fails with a JDOM error
     */
    public static void main(String[] strArr) throws IOException, JDOMException {
        if (strArr == null || strArr.length < 3) {
            throw new IllegalArgumentException(
                    "Usage: PDFCorpus <corpusDir> <corpusDescriptionFile> <outputDir>");
        }
        PDFCorpus pDFCorpus = new PDFCorpus();
        pDFCorpus.setCorpusDir(new File(strArr[0]));
        pDFCorpus.setCorpusDescriptionFile(new File(strArr[1]));
        pDFCorpus.setOutputDir(new File(strArr[2]));

        List<String> languageCodes = new ArrayList<String>();
        languageCodes.add("pt");
        // The decompiled source referenced WikipediaTokenizer.EXTERNAL_LINK here,
        // a mis-attributed inlined constant whose value is the language code "el".
        languageCodes.add("el");
        pDFCorpus.setLanguages(languageCodes);

        // The crawl date is the year in which the corpus directory was last modified.
        pDFCorpus.setCrawlDate(new SimpleDateFormat("yyyy").format(Long.valueOf(pDFCorpus.getCorpusDir().lastModified())));
        pDFCorpus.setGenre("Information");
        pDFCorpus.setTopWebDomain("http://www.mhcs.health.nsw.gov.au");
        pDFCorpus.setDomain("Medical");
        pDFCorpus.setProject("QTLP");
        pDFCorpus.setProjectWebSite("http://www.qt21.eu");
        pDFCorpus.setLinguality(languageCodes.size() > 1 ? "multilingual" : "monolingual");

        pDFCorpus.processPdfFiles(pDFCorpus.getCorpusDir(), pDFCorpus.getCorpusDescriptionFile());
    }

    /**
     * Reads the corpus description file and exports every document pair it lists.
     *
     * <p>Pairs are only exported when at least two languages are configured.
     * I/O failures are logged rather than propagated, so one unreadable
     * description file does not abort the caller.
     *
     * @param file  corpus directory in which the listed files are resolved
     * @param file2 corpus description file, one pair per line
     * @throws IOException   declared for interface compatibility; I/O errors
     *                       raised while reading or exporting are logged here
     * @throws JDOMException if exporting a pair fails with a JDOM error
     */
    private void processPdfFiles(File file, File file2) throws IOException, JDOMException {
        logger.info("Reading: " + file2);
        // The stream is a separate resource so it is closed even when the
        // reader cannot be built (for example, an unsupported charset name).
        try (FileInputStream rawInput = new FileInputStream(file2);
                BufferedReader reader = new BufferedReader(new InputStreamReader(rawInput, this.inCharsetName))) {
            if (this.languages.size() > 1) {
                int lineNumber = 0;
                String line;
                while ((line = reader.readLine()) != null) {
                    lineNumber++;
                    if (skipRuleM.reset(line).matches()) {
                        continue;
                    }
                    // Trim first: leading whitespace would otherwise produce an empty first token.
                    String[] names = line.trim().split("\\s+");
                    if (names.length < 2) {
                        logger.warn("Skipping line " + lineNumber + " of " + file2
                                + ": expected two file names but found '" + line + "'");
                        continue;
                    }
                    exportPair(file, names[0], names[1]);
                }
            }
            logger.info("Finished exporting to " + this.outputDir);
        } catch (IOException e) {
            // Covers a missing description file as well as read and export I/O errors.
            logger.error("Failed to export corpus described in " + file2, e);
        }
    }

    /** Configures one {@link PDFPair} from two listed file names and exports it. */
    private void exportPair(File corpusDirectory, String firstName, String secondName)
            throws IOException, JDOMException {
        // Only the base name of each entry is used; it is resolved inside the corpus directory.
        File firstText = new File(FilenameUtils.concat(corpusDirectory.getAbsolutePath(), new File(firstName).getName()));
        File secondText = new File(FilenameUtils.concat(corpusDirectory.getAbsolutePath(), new File(secondName).getName()));

        PDFPair pair = new PDFPair();
        pair.setTextFile1(firstText);
        pair.setTextFile2(secondText);
        // The PDF path is the text path with ".pdf.txt" replaced by ".pdf".
        pair.setPdfFile1(new File(StringUtils.replace(firstText.getAbsolutePath(), ".pdf.txt", ".pdf")));
        pair.setPdfFile2(new File(StringUtils.replace(secondText.getAbsolutePath(), ".pdf.txt", ".pdf")));
        pair.setLanguage1(this.languages.get(0));
        pair.setLanguage2(this.languages.get(1));
        pair.setCrawlDate(this.crawlDate);
        pair.setGenre(this.genre);
        pair.setTopWebDomain(this.topWebDomain);
        pair.setDomain(this.domain);
        pair.setProject(this.project);
        pair.setProjectWebSite(this.projectWebSite);
        pair.setCorpusDir(corpusDirectory);
        pair.setOutputDir(this.outputDir);
        pair.export();
    }

    /** @return the top-level web domain value passed on to each exported pair */
    public String getTopWebDomain() {
        return this.topWebDomain;
    }

    /** @param str the top-level web domain value to pass on to each exported pair */
    public void setTopWebDomain(String str) {
        this.topWebDomain = str;
    }

    /** @return the domain label passed on to each exported pair */
    public String getDomain() {
        return this.domain;
    }

    /** @param str the domain label to pass on to each exported pair */
    public void setDomain(String str) {
        this.domain = str;
    }

    /** @return the genre label passed on to each exported pair */
    public String getGenre() {
        return this.genre;
    }

    /** @param str the genre label to pass on to each exported pair */
    public void setGenre(String str) {
        this.genre = str;
    }

    /** @return the project name passed on to each exported pair */
    public String getProject() {
        return this.project;
    }

    /** @param str the project name to pass on to each exported pair */
    public void setProject(String str) {
        this.project = str;
    }

    /** @return the linguality label stored on this corpus */
    public String getLinguality() {
        return this.linguality;
    }

    /** @param str the linguality label to store on this corpus */
    public void setLinguality(String str) {
        this.linguality = str;
    }

    /** @return the language codes; the first two are assigned to each exported pair */
    public List<String> getLanguages() {
        return this.languages;
    }

    /** @param list the language codes; at least two are needed for pairs to be exported */
    public void setLanguages(List<String> list) {
        this.languages = list;
    }

    /** @return the crawl date string passed on to each exported pair */
    public String getCrawlDate() {
        return this.crawlDate;
    }

    /** @param str the crawl date string to pass on to each exported pair */
    public void setCrawlDate(String str) {
        this.crawlDate = str;
    }

    /** @return the corpus directory */
    public File getCorpusDir() {
        return this.corpusDir;
    }

    /** @param file the corpus directory */
    public void setCorpusDir(File file) {
        this.corpusDir = file;
    }

    /** @return the corpus description file */
    public File getCorpusDescriptionFile() {
        return this.corpusDescriptionFile;
    }

    /** @param file the corpus description file */
    public void setCorpusDescriptionFile(File file) {
        this.corpusDescriptionFile = file;
    }

    /** @return the output directory handed to each exported pair */
    public File getOutputDir() {
        return this.outputDir;
    }

    /** @param file the output directory to hand to each exported pair */
    public void setOutputDir(File file) {
        this.outputDir = file;
    }

    /** @return the project web site value passed on to each exported pair */
    public String getProjectWebSite() {
        return this.projectWebSite;
    }

    /** @param str the project web site value to pass on to each exported pair */
    public void setProjectWebSite(String str) {
        this.projectWebSite = str;
    }
}
