package gr.ilsp.fmc.exporter;

import bixo.datum.FetchedDatum;
import bixo.datum.UrlStatus;
import bixo.utils.CrawlDirUtils;
import cascading.scheme.SequenceFile;
import cascading.scheme.TextLine;
import cascading.tap.Hfs;
import cascading.tuple.TupleEntryIterator;
import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import gr.ilsp.fmc.datums.ClassifierDatum;
import gr.ilsp.fmc.datums.CrawlDbDatum;
import gr.ilsp.fmc.datums.ExtendedParsedDatum;
import gr.ilsp.fmc.main.ReadResources;
import gr.ilsp.fmc.main.SimpleCrawlHFS;
import gr.ilsp.fmc.utils.AnalyzerFactory;
import gr.ilsp.fmc.utils.ContentNormalizer;
import gr.ilsp.fmc.utils.CrawlConfig;
import gr.ilsp.fmc.utils.DirUtils;
import gr.ilsp.fmc.utils.JarUtils;
import gr.ilsp.fmc.utils.LithuanianAnalyzer;
import gr.ilsp.fmc.utils.PrettyPrintHandler;
import gr.ilsp.fmc.utils.TopicTools;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.lang.reflect.Proxy;
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.XMLStreamWriter;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.http.cookie.ClientCookie;
import org.apache.jasper.compiler.TagConstants;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tools.ant.types.selectors.TypeSelector;
import org.apache.xmlbeans.impl.jam.xml.JamXmlElements;
import org.codehaus.stax2.XMLOutputFactory2;
import org.codehaus.stax2.XMLStreamWriter2;

/* loaded from: input_file:gr/ilsp/fmc/exporter/SampleExporter.class */
public class SampleExporter {
    // Value written as the "version" attribute of the root cesDoc element.
    private static final String cesDocVersion = "0.4";
    // Per-paragraph threshold handed to countWords(); paragraphs failing the check are tagged "ooi-length".
    private static int MIN_TOKENS_PER_PARAGRAPH;
    // Documents whose cleaned text has fewer whitespace-separated tokens than this are not exported.
    private static int MIN_TOKENS_NUMBER;
    // Root directory of the crawl to export; export() exits if it does not exist.
    private static String crawlDirName;
    // Target language(s); XMLExporter splits this value on text_cc_separ, so several codes may be listed.
    private static String language;
    // NOTE(review): presumably the topic definition returned by getTopic() — the accessor is not visible here.
    private static String topic;
    // NOTE(review): presumably the path returned by getNegWordsFile() and read by getForbiddenwords() — confirm.
    private static String negWordsFile;
    // Content types accepted by XMLExporter; a document whose format matches none of them is skipped.
    private static String[] mimetypes;
    // Passed through exportToXml() to XMLExporter and on to the cesDoc header.
    private static String targeteddomain;
    private static URL genres;
    // Separator used when splitting language lists and Content-Type values and when joining sub-classes.
    private static final String text_cc_separ = ";";
    private static final Logger LOGGER = Logger.getLogger(SampleExporter.class);
    // Namespace URIs written as the xmlns:xlink, xmlns and xmlns:xsi attributes of the root element.
    private static String cesNameSpace = "http://www.w3.org/1999/xlink";
    private static String cesNameSpace1 = "http://www.xces.org/schema/2003";
    private static String cesNameSpace2 = "http://www.w3.org/2001/XMLSchema-instance";
    // Current calendar year as text (Calendar field 1 is Calendar.YEAR), captured once at class load.
    private static String year = Integer.toString(Calendar.getInstance().get(1));
    // Output directory; when empty, exportToXml() writes to an "xml" directory next to the loop directory.
    private static String outputDir = "";
    // When true, a plain-text file is written alongside the XML export.
    private static boolean textExport = false;
    // When true, the HTML index links to "<xml file>.html" instead of the XML file itself.
    private static boolean applyOfflineXSLT = false;
    private static XSLTransformer xslTransformer = null;
    private static boolean cesdoc = false;
    // When true (and at least one file was exported), export() also writes an HTML index to outputFileHTML.
    private static boolean html = false;
    // Parsed command-line options; assigned in main().
    private static SampleExporterOptions options = null;
    static Analyzer analyzer = null;
    static AnalyzerFactory analyzerFactory = new AnalyzerFactory();
    // Result of TopicTools.analyzeTopicALL(); stays null when no topic is configured.
    private static ArrayList<String> topicTermsAll = null;
    private static HashMap<String, String> genres_keys = null;
    // Files listed by export() in the output list and HTML index.
    // NOTE(review): presumably filled by XMLExporter for each generated cesDoc — the add() call is not visible here.
    private static ArrayList<File> xmlFiles = new ArrayList<>();
    // Path of the plain-text list of exported files written by export().
    private static String outputFile = null;
    // Path of the optional HTML index written by export().
    private static String outputFileHTML = null;
    private static String researchProject = "ILSP";

    /**
     * Logs how many entries of each {@link UrlStatus} appear in the status file of one
     * crawl loop directory, followed by the total number of entries.
     *
     * <p>Each line of the status file is expected to start with a {@code UrlStatus} name
     * followed by a tab; an unknown name makes {@code UrlStatus.valueOf} throw.
     *
     * @param jobConf Hadoop configuration used to open the status file
     * @param path    crawl loop directory containing the status sub-directory
     * @throws IOException if the status file cannot be opened or read
     */
    private static void processStatus(JobConf jobConf, Path path) throws IOException {
        String statusLocation = new Path(path, CrawlConfig.STATUS_SUBDIR_NAME).toUri().toString();
        UrlStatus[] statuses = UrlStatus.values();
        int[] countsByStatus = new int[statuses.length];
        int total = 0;

        TupleEntryIterator statusLines = new Hfs(new TextLine(), statusLocation).openForRead(jobConf);
        try {
            while (statusLines.hasNext()) {
                total++;
                // NOTE(review): JamXmlElements.LINE is a decompiler-substituted constant; it presumably
                // equals the TextLine "line" field name — confirm.
                String line = statusLines.next().getString(JamXmlElements.LINE);
                UrlStatus status = UrlStatus.valueOf(line.split("\t")[0]);
                countsByStatus[status.ordinal()]++;
            }
        } finally {
            // The iterator holds an open file handle; release it even when parsing fails.
            statusLines.close();
        }

        for (int idx = 0; idx < countsByStatus.length; idx++) {
            if (countsByStatus[idx] != 0) {
                LOGGER.info(String.format("Status %s: %d", statuses[idx].toString(), Integer.valueOf(countsByStatus[idx])));
            }
        }
        LOGGER.info("Total status: " + total);
        LOGGER.info("");
    }

    /**
     * Opens the crawl DB of the given loop directory and the classifier output of every
     * loop directory under its parent, warning about gaps in the loop numbering.
     *
     * <p>No tuples are read in this version: the readers are opened (which fails with an
     * {@link IOException} if the data cannot be opened) and closed again straight away.
     * Consequently the summary printed when {@code z} is {@code false} always reports zero.
     *
     * @param jobConf Hadoop configuration used to open the files
     * @param path    crawl loop directory whose crawl DB is opened
     * @param z       when {@code true} the (all-zero) URL summary is not logged
     * @throws IOException if any of the files cannot be opened
     */
    private static void processCrawlDb(JobConf jobConf, Path path, boolean z) throws IOException {
        String crawlDbLocation = new Path(path, CrawlConfig.CRAWLDB_SUBDIR_NAME).toUri().toString();
        // Close the reader immediately instead of abandoning it with an open file handle.
        new Hfs(new SequenceFile(CrawlDbDatum.FIELDS), crawlDbLocation).openForRead(jobConf).close();

        Path crawlRoot = path.getParent();
        FileSystem fileSystem = crawlRoot.getFileSystem(jobConf);
        int previousLoop = -1;
        Path loopDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlRoot, previousLoop);
        while (loopDir != null) {
            int loopNumber = CrawlDirUtils.extractLoopNumber(loopDir);
            if (loopNumber != previousLoop + 1) {
                LOGGER.warn(String.format("Missing directories between %d and %d", Integer.valueOf(previousLoop), Integer.valueOf(loopNumber)));
            }
            String classifierLocation = new Path(loopDir, CrawlConfig.CLASSIFIER_SUBDIR_NAME).toUri().toString();
            new Hfs(new SequenceFile(ClassifierDatum.FIELDS), classifierLocation).openForRead(jobConf).close();
            previousLoop = loopNumber;
            loopDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlRoot, previousLoop);
        }

        if (z) {
            return;
        }
        // NOTE(review): nothing is counted above, so these figures are always zero.
        LOGGER.info(String.format("%d fetched URLs", 0));
        LOGGER.info(String.format("%d unfetched URLs", 0));
        LOGGER.info("Total URLs: 0");
        LOGGER.info("");
    }

    /**
     * Exports every crawl loop found under {@code crawlDirName} to cesDoc XML, writes the
     * list of generated files to {@code outputFile} (plus an HTML index to
     * {@code outputFileHTML} when {@code html} is set), and finally runs the crawl DB check
     * on the latest loop directory.
     *
     * <p>Any failure is logged and terminates the JVM with exit code -1, as does a missing
     * crawl directory.
     *
     * @param z when {@code true}, the language-detection profiles are loaded from the
     *          {@code /profiles} classpath resource before exporting
     */
    public void export(boolean z) {
        long startMillis = System.currentTimeMillis();
        try {
            JobConf jobConf = new JobConf();
            Path crawlPath = new Path(crawlDirName);
            FileSystem fileSystem = crawlPath.getFileSystem(jobConf);
            if (!fileSystem.exists(crawlPath)) {
                System.err.println("Prior crawl output directory does not exist: " + crawlDirName);
                System.exit(-1);
            }

            if (z) {
                URL profiles = SimpleCrawlHFS.class.getResource("/profiles");
                LOGGER.debug(profiles);
                if (profiles == null) {
                    // Fail with a clear message instead of a bare NullPointerException.
                    throw new IllegalStateException("Language profiles resource /profiles not found on the classpath");
                }
                try {
                    // Compare by value: identity comparison of strings only works by accident.
                    if ("jar".equals(profiles.getProtocol())) {
                        // Profiles packed inside a jar are first unpacked to a temporary directory.
                        File tempDir = DirUtils.createTempDir();
                        LOGGER.debug(tempDir);
                        JarUtils.copyResourcesRecursively(profiles, tempDir);
                        DetectorFactory.loadProfile(tempDir);
                    } else {
                        DetectorFactory.loadProfile(new File(profiles.toURI()));
                    }
                } catch (LangDetectException | URISyntaxException e) {
                    // Best effort, as before: a profile-loading problem is logged and the export goes on.
                    LOGGER.error(e.getMessage(), e);
                }
            }

            Logger.getRootLogger().setLevel(Level.INFO);

            String topicFile = getTopic();
            ArrayList<String[]> topicTerms = null;
            if (topicFile != null) {
                topicTerms = TopicTools.analyzeTopic(topicFile, language, jobConf);
                topicTermsAll = TopicTools.analyzeTopicALL(topicTerms);
            }

            int previousLoop = -1;
            int nextDocId = 1;
            Path loopDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlPath, previousLoop);
            while (loopDir != null) {
                nextDocId = exportToXml(jobConf, loopDir, language, nextDocId, topicTerms, targeteddomain);
                int loopNumber = CrawlDirUtils.extractLoopNumber(loopDir);
                if (loopNumber != previousLoop + 1) {
                    LOGGER.warn(String.format("Missing directories between %d and %d", Integer.valueOf(previousLoop), Integer.valueOf(loopNumber)));
                }
                previousLoop = loopNumber;
                loopDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlPath, previousLoop);
            }

            LOGGER.info("CesDoc files generated: " + xmlFiles.size());
            LOGGER.info("Completed in " + (System.currentTimeMillis() - startMillis) + " milliseconds.");
            writeXmlFileList();
            if (html && xmlFiles.size() > 0) {
                writeHtmlFileIndex();
            }

            // Hard-wired switch: only the crawl DB of the latest loop is checked. The per-loop
            // status report below is kept for when the switch is turned off.
            final boolean latestLoopOnly = true;
            if (latestLoopOnly) {
                processCrawlDb(jobConf, CrawlDirUtils.findLatestLoopDir(fileSystem, crawlPath), true);
            } else {
                int previousReported = -1;
                Path reportDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlPath, previousReported);
                while (reportDir != null) {
                    LOGGER.info("");
                    LOGGER.info("================================================================");
                    LOGGER.info("Processing " + reportDir.toUri().toString());
                    LOGGER.info("================================================================");
                    int loopNumber = CrawlDirUtils.extractLoopNumber(reportDir);
                    if (loopNumber != previousReported + 1) {
                        LOGGER.warn(String.format("Missing directories between %d and %d", Integer.valueOf(previousReported), Integer.valueOf(loopNumber)));
                    }
                    previousReported = loopNumber;
                    processStatus(jobConf, reportDir);
                    processCrawlDb(jobConf, reportDir, true);
                    reportDir = CrawlDirUtils.findNextLoopDir(fileSystem, crawlPath, previousReported);
                }
            }
        } catch (Throwable th) {
            LOGGER.error("Exception running tool", th);
            System.exit(-1);
        }
    }

    /** Writes the absolute path (with forward slashes) of every generated XML file, one per line, to {@code outputFile}. */
    private static void writeXmlFileList() throws IOException {
        try (OutputStreamWriter listWriter = new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8")) {
            for (File xmlFile : xmlFiles) {
                listWriter.write(xmlFile.getAbsolutePath().replace("\\", "/") + "\n");
            }
        }
    }

    /** Writes an HTML page linking to every generated file (or to its ".html" rendering when offline XSLT is on) to {@code outputFileHTML}. */
    private static void writeHtmlFileIndex() throws IOException {
        try (OutputStreamWriter indexWriter = new OutputStreamWriter(new FileOutputStream(outputFileHTML), "UTF-8")) {
            indexWriter.write("<html xmlns=\"http://www.w3.org/1999/xhtml\">\n");
            for (File xmlFile : xmlFiles) {
                String target = applyOfflineXSLT
                        ? new File(xmlFile.getAbsolutePath() + ".html").getAbsolutePath().replace("\\", "/")
                        : xmlFile.getAbsolutePath();
                String link = "<a href=\"" + target + "\">\n" + target + "</a>";
                indexWriter.write("<br />" + link.replace("\\", "/") + "\n");
            }
            indexWriter.write("</html>");
        }
    }

    /**
     * Command-line entry point: parses the arguments into {@code options}, copies the
     * parsed settings onto a new exporter and runs {@link #export(boolean)} with
     * language-profile loading enabled.
     *
     * <p>The list of exported files is written to {@code outputlist.txt} inside the input
     * directory.
     *
     * @param strArr command-line arguments understood by {@code SampleExporterOptions}
     */
    public static void main(String[] strArr) {
        SampleExporter exporter = new SampleExporter();
        options = new SampleExporterOptions();
        options.parseOptions(strArr);

        exporter.setMIN_TOKENS_PER_PARAGRAPH(options.get_length());
        exporter.setMIN_TOKENS_NUMBER(options.get_minTokenslength());
        exporter.setCrawlDirName(options.get_inputdir());
        exporter.setOutputFile(options.get_inputdir() + System.getProperty("file.separator") + "outputlist.txt");
        exporter.setLanguage(options.get_language());
        // The topic used to be applied twice (before and after the language); the later call is
        // the one that determined the final state, so only that one is kept.
        if (options.get_topic() != null) {
            exporter.setTopic(options.get_topic());
        }
        if (options.get_negwords() != null) {
            exporter.setNegWordsFile(options.get_negwords());
        }
        if (options.get_outputdir() != null) {
            exporter.setOutputDir(options.get_outputdir());
        }
        if (options.get_textexport()) {
            exporter.setTextExport(true);
        }
        if (options.get_style()) {
            exporter.setStyleExport(options.get_style());
        }
        exporter.export(true);
    }

    /**
     * Exports every parsed document of one crawl loop directory through {@code XMLExporter}
     * and, when text export is enabled, writes a matching plain-text file.
     *
     * <p>The fetched content and classifier data are read with forward-only iterators that
     * are advanced until the document's URL is found.
     * NOTE(review): this presumably relies on those files being ordered like the parse
     * output; once a URL is missed the iterator is exhausted for all later documents — confirm.
     *
     * @param jobConf   Hadoop configuration used to open the crawl files
     * @param path      crawl loop directory to export
     * @param str       target language(s), forwarded to {@code XMLExporter}
     * @param i         id to give to the next exported document
     * @param arrayList analysed topic terms, or {@code null} when no topic is configured
     * @param str2      targeted domain, forwarded to {@code XMLExporter}
     * @return the id to use for the next exported document
     * @throws IOException if the crawl files cannot be opened or the output directory cannot be created
     */
    private static int exportToXml(JobConf jobConf, Path path, String str, int i, ArrayList<String[]> arrayList, String str2) throws IOException {
        Hfs parseTap = new Hfs(new SequenceFile(ExtendedParsedDatum.FIELDS), new Path(path, CrawlConfig.PARSE_SUBDIR_NAME).toUri().toString());
        Hfs contentTap = new Hfs(new SequenceFile(FetchedDatum.FIELDS), new Path(path, CrawlConfig.CONTENT_SUBDIR_NAME).toUri().toString());
        Hfs classifierTap = new Hfs(new SequenceFile(ClassifierDatum.FIELDS), new Path(path, CrawlConfig.CLASSIFIER_SUBDIR_NAME).toUri().toString());

        Path outputPath = outputDir.length() == 0 ? new Path(path.getParent(), "xml") : new Path(outputDir);
        FileSystem outputFs = outputPath.getFileSystem(jobConf);
        if (!outputFs.exists(outputPath)) {
            outputFs.mkdirs(outputPath);
        }
        String[] forbiddenWords = getNegWordsFile() != null ? getForbiddenwords(getNegWordsFile()) : null;

        // Two independent cursors over the classifier output: one for sub-classes, one for scores.
        TupleEntryIterator subdomainCursor = null;
        TupleEntryIterator relScoreCursor = null;
        TupleEntryIterator contentCursor = null;
        TupleEntryIterator parsedCursor = null;
        try {
            subdomainCursor = classifierTap.openForRead(jobConf);
            relScoreCursor = classifierTap.openForRead(jobConf);
            contentCursor = contentTap.openForRead(jobConf);
            parsedCursor = parseTap.openForRead(jobConf);

            while (parsedCursor.hasNext()) {
                ExtendedParsedDatum parsed = new ExtendedParsedDatum(parsedCursor.next());
                String url = parsed.getUrl();
                LOGGER.debug("Writing: " + i + ShingleFilter.TOKEN_SEPARATOR + url);
                String title = parsed.getTitle();
                if (title == null) {
                    title = "";
                }
                String cleanText = ContentNormalizer.normalizeText(parsed.getParsedText());

                Map<String, String> meta = parsed.getParsedMeta();
                String author = meta.get(MSOffice.AUTHOR);
                String publisher = meta.get("Publisher");
                String keywordsMeta = meta.get("keywords");
                String comment = meta.get("comment");
                ArrayList<String> keywords = new ArrayList<String>();
                if (keywordsMeta != null) {
                    for (String keyword : keywordsMeta.split(",|;|:")) {
                        keywords.add(keyword.trim());
                    }
                }
                String encoding = meta.get("Content-Encoding");
                // A document without a Content-Type gets an empty format instead of a NullPointerException.
                String contentType = meta.get("Content-Type");
                String format = contentType == null ? "" : validFormat(contentType);

                String htmlText = format.contains("pdf") ? "" : getHtml(url, path, contentCursor, encoding);
                if (htmlText == null) {
                    // Writing a null string would abort the whole export with a NullPointerException.
                    LOGGER.warn("No fetched content found for " + url);
                    htmlText = "";
                }

                boolean exported = XMLExporter(outputPath, format, title, url, str, htmlText, cleanText, i, "", author, publisher, str2,
                        getSubdomains(url, path, subdomainCursor), keywords, arrayList, forbiddenWords,
                        meta.get(CreativeCommons.LICENSE_URL), "", getRelscore(url, path, relScoreCursor), comment).booleanValue();
                if (exported) {
                    // Only exported documents get a text file; previously a rejected document
                    // overwrote the text file of the last exported one.
                    if (textExport) {
                        TextExporter(outputPath, cleanText, i);
                    }
                    i++;
                }
            }
        } finally {
            if (parsedCursor != null) {
                parsedCursor.close();
            }
            if (contentCursor != null) {
                contentCursor.close();
            }
            if (relScoreCursor != null) {
                relScoreCursor.close();
            }
            if (subdomainCursor != null) {
                subdomainCursor.close();
            }
        }
        return i;
    }

    /**
     * Advances the classifier iterator until the entry for the given URL is found and
     * returns its total relevance score, capped at 1.0.
     *
     * @param str                URL to look for
     * @param path               unused
     * @param tupleEntryIterator forward-only iterator over classifier tuples; it is consumed
     *                           up to and including the matching entry
     * @return the score if it is below 1.0, otherwise 1.0; 0.0 when the iterator runs out
     *         without a match
     */
    private static double getRelscore(String str, Path path, TupleEntryIterator tupleEntryIterator) {
        while (tupleEntryIterator.hasNext()) {
            ClassifierDatum candidate = new ClassifierDatum(tupleEntryIterator.next());
            if (!candidate.getUrl().equals(str)) {
                continue;
            }
            double score = candidate.getTotRelScore().doubleValue();
            return score < 1.0d ? score : 1.0d;
        }
        return 0.0d;
    }

    /**
     * Advances the fetched-content iterator until the entry for the given URL is found and
     * returns its content decoded as text, with every line terminated by {@code "\r\n"}.
     *
     * <p>When no encoding is given, or the given one is not supported, the content is decoded
     * as UTF-8 instead of failing.
     *
     * @param str                URL to look for
     * @param path               unused
     * @param tupleEntryIterator forward-only iterator over fetched tuples; it is consumed up
     *                           to and including the matching entry
     * @param str2               charset name of the content, may be {@code null}
     * @return the decoded content (possibly partial if reading fails), or {@code null} when
     *         the iterator runs out without a match
     */
    private static String getHtml(String str, Path path, TupleEntryIterator tupleEntryIterator, String str2) {
        while (tupleEntryIterator.hasNext()) {
            FetchedDatum fetched = new FetchedDatum(tupleEntryIterator.next());
            if (!fetched.getUrl().equals(str)) {
                continue;
            }
            ByteArrayInputStream rawContent = new ByteArrayInputStream(fetched.getContentBytes(), 0, fetched.getContentLength());
            String charsetName = (str2 == null || str2.trim().isEmpty()) ? "UTF-8" : str2.trim();
            InputStreamReader decoder;
            try {
                decoder = new InputStreamReader(rawContent, charsetName);
            } catch (UnsupportedEncodingException e) {
                LOGGER.warn("Unsupported encoding '" + charsetName + "' for " + str + "; decoding as UTF-8");
                // Fully qualified so that no additional import is required.
                decoder = new InputStreamReader(rawContent, java.nio.charset.Charset.forName("UTF-8"));
            }

            StringBuilder htmlText = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(decoder)) {
                String line = reader.readLine();
                while (line != null) {
                    htmlText.append(line).append("\r\n");
                    line = reader.readLine();
                }
            } catch (IOException e) {
                // Best effort, as before: return whatever was decoded so far.
                LOGGER.error("Could not read the fetched content of " + str, e);
            }
            return htmlText.toString();
        }
        return null;
    }

    /**
     * Advances the classifier iterator until the entry for the given URL is found and
     * returns its sub-classes joined with {@code text_cc_separ}.
     *
     * @param str                URL to look for
     * @param path               unused
     * @param tupleEntryIterator forward-only iterator over classifier tuples; it is consumed
     *                           up to and including the matching entry
     * @return the joined sub-classes without a trailing separator, an empty string when the
     *         entry has no sub-classes, or {@code null} when the iterator runs out without a match
     */
    private static String getSubdomains(String str, Path path, TupleEntryIterator tupleEntryIterator) {
        while (tupleEntryIterator.hasNext()) {
            ClassifierDatum candidate = new ClassifierDatum(tupleEntryIterator.next());
            if (!candidate.getUrl().equals(str)) {
                continue;
            }
            StringBuilder joined = new StringBuilder();
            for (String subClass : candidate.getSubClasses()) {
                joined.append(subClass).append(text_cc_separ);
            }
            // Check the length rather than comparing with == "", which tests identity, not content.
            if (joined.length() > 0) {
                joined.setLength(joined.length() - text_cc_separ.length());
            }
            return joined.toString();
        }
        return null;
    }

    /**
     * Strips any parameters from a Content-Type value, e.g. {@code "text/html; charset=utf-8"}
     * becomes {@code "text/html"}.
     *
     * @param str raw Content-Type value, may be {@code null}
     * @return the part before the first {@code text_cc_separ}, trimmed; an empty string for
     *         {@code null} or for a value that starts with the separator
     */
    private static String validFormat(String str) {
        if (str == null) {
            return "";
        }
        // indexOf/substring instead of split(...)[0]: split returns an empty array for input
        // consisting only of separators, which made the [0] access throw.
        int separatorAt = str.indexOf(text_cc_separ);
        String mediaType = separatorAt < 0 ? str : str.substring(0, separatorAt);
        return mediaType.trim();
    }

    /**
     * Reads the forbidden-words file, one entry per line.
     *
     * <p>The file is read with the platform default charset, as before.
     *
     * @param str path of the forbidden-words file
     * @return the lines of the file in order (empty array for an empty file), or {@code null}
     *         when the file does not exist or cannot be read
     */
    public static String[] getForbiddenwords(String str) {
        File wordsFile = new File(str);
        if (!wordsFile.exists()) {
            return null;
        }
        ArrayList<String> words = new ArrayList<String>();
        // try-with-resources closes the reader even when readLine() fails part-way through.
        try (BufferedReader reader = new BufferedReader(new FileReader(wordsFile))) {
            String line = reader.readLine();
            while (line != null) {
                words.add(line);
                line = reader.readLine();
            }
        } catch (IOException e) {
            System.err.println("Problem in reading the file with the forbidden words" + " (" + str + "): " + e);
            return null;
        }
        return words.toArray(new String[0]);
    }

    /**
     * Writes the plain text of a document to {@code <i>.txt} inside the given directory,
     * encoded as UTF-8. Boilerplate lines ({@code <boiler>...</boiler>}) and all remaining
     * tags are stripped first.
     *
     * <p>I/O failures are logged and otherwise ignored.
     *
     * @param path output directory
     * @param str  normalized document text, still containing the paragraph markup
     * @param i    document id used as the file name
     */
    public static void TextExporter(Path path, String str, int i) {
        String targetFile = new Path(path, i + ".txt").toUri().getPath();
        // try-with-resources closes the file even when write() fails.
        try (BufferedWriter textWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(targetFile), "UTF-8"))) {
            textWriter.write(str.replaceAll("<boiler>.*</boiler>\r\n", "").replaceAll("<[^<]*>", ""));
        } catch (IOException e) {
            // FileNotFoundException and UnsupportedEncodingException are IOExceptions too.
            LOGGER.error(e.getMessage(), e);
        }
    }

    public static Boolean XMLExporter(Path path, String str, String str2, String str3, String str4, String str5, String str6, int i, String str7, String str8, String str9, String str10, String str11, ArrayList<String> arrayList, ArrayList<String[]> arrayList2, String[] strArr, String str12, String str13, double d, String str14) {
        String replaceAll;
        String str15;
        String str16 = "";
        boolean z = false;
        int i2 = 0;
        while (true) {
            if (i2 >= mimetypes.length) {
                break;
            }
            if (str.equals(mimetypes[i2])) {
                z = true;
                break;
            }
            i2++;
        }
        if (!z) {
            return false;
        }
        if (str.contains("application/pdf")) {
            replaceAll = str6.replaceAll("<text>", "").replaceAll("</text>", "");
        } else {
            String[] split = str6.split("\n");
            for (int i3 = 0; i3 < split.length; i3++) {
                if (!split[i3].contains("<boiler") && !split[i3].contains("</boiler>")) {
                    str16 = String.valueOf(str16) + split[i3] + "\n";
                }
            }
            replaceAll = str16.replaceAll("<text>", "").replaceAll("</text>", "").replaceAll("<text type.*>", "");
        }
        StringTokenizer stringTokenizer = new StringTokenizer(replaceAll);
        LOGGER.debug(str3);
        LOGGER.debug(Integer.valueOf(stringTokenizer.countTokens()));
        if (stringTokenizer.countTokens() < MIN_TOKENS_NUMBER) {
            return false;
        }
        String str17 = "";
        String checkLang = checkLang(replaceAll.toLowerCase());
        String[] split2 = str4.split(text_cc_separ);
        boolean z2 = false;
        int length = split2.length;
        int i4 = 0;
        while (true) {
            if (i4 >= length) {
                break;
            }
            if (checkLang.equals(split2[i4])) {
                z2 = true;
                break;
            }
            i4++;
        }
        if (!z2) {
            return false;
        }
        String num = Integer.toString(i);
        if (str.contains("application/pdf")) {
            str15 = String.valueOf(num) + ".pdf";
            str5 = str6;
        } else {
            str15 = String.valueOf(num) + ".html";
        }
        Path path2 = new Path(TypeSelector.FileType.FILE, "", FilenameUtils.concat(path.toUri().getPath(), String.valueOf(num) + ".xml"));
        Path path3 = new Path(path, str15);
        if (str.contains("application/pdf")) {
            try {
                ReadResources.copy(str14, FilenameUtils.concat(path.toUri().getPath(), String.valueOf(num) + ".pdf"));
                new File(str14).delete();
            } catch (IOException e) {
                LOGGER.info("source PDF file is not stored.");
                e.printStackTrace();
            }
        } else {
            try {
                OutputStreamWriter outputStreamWriter = new OutputStreamWriter(new FileOutputStream(path3.toUri().getPath()), "UTF-8");
                outputStreamWriter.write(str5);
                outputStreamWriter.close();
            } catch (FileNotFoundException e2) {
                e2.printStackTrace();
            } catch (UnsupportedEncodingException e3) {
                e3.printStackTrace();
            } catch (IOException e4) {
                e4.printStackTrace();
            }
        }
        int i5 = 1;
        XMLOutputFactory2 xMLOutputFactory2 = (XMLOutputFactory2) XMLOutputFactory2.newInstance();
        XMLStreamWriter xMLStreamWriter = null;
        XMLStreamWriter xMLStreamWriter2 = null;
        OutputStreamWriter outputStreamWriter2 = null;
        try {
            outputStreamWriter2 = new OutputStreamWriter(new FileOutputStream(path2.toUri().getPath()), "UTF-8");
        } catch (FileNotFoundException e5) {
            e5.printStackTrace();
        } catch (UnsupportedEncodingException e6) {
            e6.printStackTrace();
        }
        try {
            try {
                XMLStreamWriter2 xMLStreamWriter22 = (XMLStreamWriter2) xMLOutputFactory2.createXMLStreamWriter(outputStreamWriter2);
                XMLStreamWriter2 xMLStreamWriter23 = (XMLStreamWriter2) Proxy.newProxyInstance(XMLStreamWriter2.class.getClassLoader(), new Class[]{XMLStreamWriter2.class}, new PrettyPrintHandler(xMLStreamWriter22));
                xMLStreamWriter23.writeStartDocument();
                xMLStreamWriter23.writeStartElement("cesDoc");
                xMLStreamWriter23.writeAttribute("version", cesDocVersion);
                xMLStreamWriter23.writeAttribute("xmlns:xlink", cesNameSpace);
                xMLStreamWriter23.writeAttribute("xmlns", cesNameSpace1);
                xMLStreamWriter23.writeAttribute("xmlns:xsi", cesNameSpace2);
                createHeader(xMLStreamWriter23, str3, str7, str8, str9, checkLang, str2, str10, arrayList, path3.getName(), str, str11, str12, str13, d);
                xMLStreamWriter23.writeStartElement("text");
                xMLStreamWriter23.writeStartElement(TagConstants.BODY_ACTION);
                try {
                    for (String str18 : str6.split("\n")) {
                        if (str18.length() != 0 && str18.toCharArray()[0] != 160) {
                            xMLStreamWriter23.writeStartElement("p");
                            xMLStreamWriter23.writeAttribute("id", "p" + i5);
                            if (str18.substring(0, 7).equals("<boiler")) {
                                if (str18.substring(0, 8).equals("<boiler>")) {
                                    xMLStreamWriter23.writeAttribute("crawlinfo", "boilerplate");
                                    str18 = str18.substring(8, str18.length() - 9);
                                } else if (str18.substring(0, 15).equals("<boiler type='t")) {
                                    xMLStreamWriter23.writeAttribute("crawlinfo", "boilerplate");
                                    xMLStreamWriter23.writeAttribute("type", "title");
                                    str18 = str18.substring(21, str18.length() - 9);
                                } else if (str18.substring(0, 15).equals("<boiler type='h")) {
                                    xMLStreamWriter23.writeAttribute("crawlinfo", "boilerplate");
                                    xMLStreamWriter23.writeAttribute("type", "heading");
                                    str18 = str18.substring(23, str18.length() - 9);
                                } else if (str18.substring(0, 15).equals("<boiler type='l")) {
                                    xMLStreamWriter23.writeAttribute("crawlinfo", "boilerplate");
                                    xMLStreamWriter23.writeAttribute("type", "listitem");
                                    str18 = str18.substring(24, str18.length() - 9);
                                }
                            } else if (str18.substring(0, 5).equals("<text")) {
                                if (str18.substring(0, 6).equals("<text>")) {
                                    str18 = str18.substring(6, str18.length() - 7);
                                    if (!countWords(str18, MIN_TOKENS_PER_PARAGRAPH).booleanValue()) {
                                        xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-length");
                                    } else if (!str4.isEmpty()) {
                                        str17 = checkLang(str18.toLowerCase());
                                        if (!str17.equals(checkLang)) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-lang");
                                        } else if (findWords(str18, strArr).booleanValue()) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-neg");
                                        } else {
                                            String findTopicTerms = findTopicTerms(str18, arrayList2, checkLang, topicTermsAll);
                                            if (!findTopicTerms.isEmpty()) {
                                                xMLStreamWriter23.writeAttribute("topic", findTopicTerms);
                                            }
                                        }
                                    }
                                } else if (str18.substring(0, 13).equals("<text type='t")) {
                                    str18 = str18.substring(19, str18.length() - 7);
                                    if (!countWords(str18, MIN_TOKENS_PER_PARAGRAPH).booleanValue()) {
                                        xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-length");
                                        xMLStreamWriter23.writeAttribute("type", "title");
                                    } else if (!str4.isEmpty()) {
                                        str17 = checkLang(str18.toLowerCase());
                                        if (!str17.equals(checkLang)) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-lang");
                                            xMLStreamWriter23.writeAttribute("type", "title");
                                        } else if (findWords(str18, strArr).booleanValue()) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-neg");
                                            xMLStreamWriter23.writeAttribute("type", "title");
                                        } else {
                                            String findTopicTerms2 = findTopicTerms(str18, arrayList2, checkLang, topicTermsAll);
                                            xMLStreamWriter23.writeAttribute("type", "title");
                                            if (!findTopicTerms2.isEmpty()) {
                                                xMLStreamWriter23.writeAttribute("topic", findTopicTerms2);
                                            }
                                        }
                                    }
                                } else if (str18.substring(0, 13).equals("<text type='l")) {
                                    str18 = str18.substring(22, str18.length() - 7);
                                    if (!countWords(str18, MIN_TOKENS_PER_PARAGRAPH).booleanValue()) {
                                        xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-length");
                                        xMLStreamWriter23.writeAttribute("type", "listitem");
                                    } else if (!str4.isEmpty()) {
                                        str17 = checkLang(str18.toLowerCase());
                                        if (!str17.equals(checkLang)) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-lang");
                                            xMLStreamWriter23.writeAttribute("type", "listitem");
                                        } else if (findWords(str18, strArr).booleanValue()) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-neg");
                                            xMLStreamWriter23.writeAttribute("type", "listitem");
                                        } else {
                                            String findTopicTerms3 = findTopicTerms(str18, arrayList2, checkLang, topicTermsAll);
                                            xMLStreamWriter23.writeAttribute("type", "listitem");
                                            if (!findTopicTerms3.isEmpty()) {
                                                xMLStreamWriter23.writeAttribute("topic", findTopicTerms3);
                                            }
                                        }
                                    }
                                } else if (str18.substring(0, 13).equals("<text type='h")) {
                                    str18 = str18.substring(21, str18.length() - 7);
                                    if (!countWords(str18, MIN_TOKENS_PER_PARAGRAPH).booleanValue()) {
                                        xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-length");
                                        xMLStreamWriter23.writeAttribute("type", "heading");
                                    } else if (!str4.isEmpty()) {
                                        str17 = checkLang(str18.toLowerCase());
                                        if (!str17.equals(checkLang)) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-lang");
                                            xMLStreamWriter23.writeAttribute("type", "heading");
                                        } else if (findWords(str18, strArr).booleanValue()) {
                                            xMLStreamWriter23.writeAttribute("crawlinfo", "ooi-neg");
                                            xMLStreamWriter23.writeAttribute("type", "heading");
                                        } else {
                                            String findTopicTerms4 = findTopicTerms(str18, arrayList2, checkLang, topicTermsAll);
                                            xMLStreamWriter23.writeAttribute("type", "heading");
                                            if (!findTopicTerms4.isEmpty()) {
                                                xMLStreamWriter23.writeAttribute("topic", findTopicTerms4);
                                            }
                                        }
                                    }
                                }
                            } else if (str18.trim().length() > 1) {
                                if (countWords(str18, MIN_TOKENS_PER_PARAGRAPH).booleanValue()) {
                                    try {
                                        Detector create = DetectorFactory.create();
                                        create.append(str18);
                                        str17 = create.detect();
                                    } catch (LangDetectException e7) {
                                    }
                                    if (!str17.equals(checkLang)) {
                                        xMLStreamWriter23.writeAttribute("type", "lang");
                                    } else if (findWords(str18, strArr).booleanValue()) {
                                        xMLStreamWriter23.writeAttribute("type", CrawlConfig.CONTENT_SUBDIR_NAME);
                                    } else if (arrayList2 != null) {
                                        String[] strArr2 = new String[1];
                                        ArrayList<String> arrayList3 = new ArrayList<>();
                                        try {
                                            arrayList3 = TopicTools.analyze(str18, str17);
                                        } catch (IOException e8) {
                                            e8.printStackTrace();
                                        }
                                        String str19 = "";
                                        Iterator<String> it = arrayList3.iterator();
                                        while (it.hasNext()) {
                                            str19 = str19.concat(ShingleFilter.TOKEN_SEPARATOR + it.next());
                                        }
                                        String trim = str19.trim();
                                        Boolean bool = false;
                                        int i6 = 0;
                                        while (true) {
                                            if (i6 >= arrayList2.size()) {
                                                break;
                                            }
                                            if (Pattern.compile(ShingleFilter.TOKEN_SEPARATOR + arrayList2.get(i6)[1] + ShingleFilter.TOKEN_SEPARATOR).matcher(ShingleFilter.TOKEN_SEPARATOR + trim + ShingleFilter.TOKEN_SEPARATOR).find()) {
                                                bool = true;
                                                break;
                                            }
                                            i6++;
                                        }
                                        if (!bool.booleanValue()) {
                                            xMLStreamWriter23.writeAttribute("type", "terms");
                                        }
                                    }
                                } else {
                                    xMLStreamWriter23.writeAttribute("type", "length");
                                }
                            }
                            xMLStreamWriter23.writeCharacters(str18);
                            xMLStreamWriter23.writeEndElement();
                            i5++;
                        }
                    }
                    xMLStreamWriter23.writeEndElement();
                    xMLStreamWriter23.writeEndElement();
                    xMLStreamWriter23.writeEndElement();
                    xMLStreamWriter23.flush();
                    try {
                        xMLStreamWriter23.close();
                        xMLStreamWriter22.close();
                        outputStreamWriter2.close();
                    } catch (IOException e9) {
                        LOGGER.error(e9.getMessage());
                    } catch (XMLStreamException e10) {
                        LOGGER.error(e10.getMessage());
                    }
                    xmlFiles.add(new File(path2.toUri()));
                    if (applyOfflineXSLT) {
                        File file = new File(path2.toUri());
                        File file2 = new File(String.valueOf(FilenameUtils.removeExtension(file.getAbsolutePath())) + ".xml.html");
                        try {
                            xslTransformer.transform(file, file2);
                        } catch (TransformerException e11) {
                            e11.printStackTrace();
                            LOGGER.warn("Could not transform " + file.getAbsolutePath() + " to " + file2.getAbsolutePath());
                        }
                    }
                    return true;
                } catch (Exception e12) {
                    LOGGER.info("Could not write file with id " + num);
                    try {
                        xMLStreamWriter23.close();
                        xMLStreamWriter22.close();
                        outputStreamWriter2.close();
                    } catch (IOException e13) {
                        LOGGER.error(e13.getMessage());
                    } catch (XMLStreamException e14) {
                        LOGGER.error(e14.getMessage());
                    }
                    return false;
                }
            } catch (XMLStreamException e15) {
                LOGGER.error("Could not write XML " + path2);
                LOGGER.error(e15.getMessage());
                return false;
            }
        } finally {
            try {
                xMLStreamWriter2.close();
                xMLStreamWriter.close();
                outputStreamWriter2.close();
            } catch (IOException e16) {
                LOGGER.error(e16.getMessage());
            } catch (XMLStreamException e17) {
                LOGGER.error(e17.getMessage());
            }
        }
    }

    /**
     * Detects the language of the given text with the language-detection library.
     *
     * @param str text to identify
     * @return the value returned by the detector, or an empty string when the
     *         detector raises a {@link LangDetectException}
     */
    private static String checkLang(String str) {
        try {
            Detector detector = DetectorFactory.create();
            detector.append(str);
            return detector.detect();
        } catch (LangDetectException ignored) {
            // Detection is best-effort: a failure is reported as "no language".
            return "";
        }
    }

    /**
     * Collects the labels of all topic entries whose term occurs in the analyzed
     * form of {@code str}.
     *
     * <p>The text is only examined when {@code str2} is one of the codes obtained
     * by splitting the configured {@code language} value on {@code text_cc_separ}.
     * Each topic entry is read as: index 0 = weight (parsed as a double),
     * index 1 = term, index 3 = language code, index 4 = value appended to the
     * result. An entry contributes only when its language equals {@code str2},
     * its weight is greater than zero and its term is found, surrounded by token
     * separators, in the separator-joined token sequence.
     *
     * @param str        text to search
     * @param arrayList  topic entries; {@code null} yields an empty result
     * @param str2       language code of the text; empty yields an empty result
     * @param arrayList2 not used by this method
     * @return the matched index-4 values joined with {@code text_cc_separ}, or an
     *         empty string when nothing matched
     */
    public static String findTopicTerms(String str, ArrayList<String[]> arrayList, String str2, ArrayList<String> arrayList2) {
        if (arrayList == null || str2.isEmpty()) {
            return "";
        }
        boolean targetedLanguage = false;
        for (String candidate : language.split(text_cc_separ)) {
            if (candidate.equals(str2)) {
                targetedLanguage = true;
                break;
            }
        }
        if (!targetedLanguage) {
            return "";
        }

        ArrayList<String> tokens = null;
        try {
            tokens = analyze(str, str2);
        } catch (IOException e) {
            e.printStackTrace();
        }
        // analyze() returns null when the analysis fails; treat that like "no tokens"
        // instead of failing with a NullPointerException.
        StringBuilder joinedTokens = new StringBuilder();
        if (tokens != null) {
            for (String token : tokens) {
                joinedTokens.append(ShingleFilter.TOKEN_SEPARATOR).append(token);
            }
        }
        // Pad both ends so that a term at the very start or end is still delimited.
        String haystack = ShingleFilter.TOKEN_SEPARATOR + joinedTokens.toString().trim() + ShingleFilter.TOKEN_SEPARATOR;

        StringBuilder matches = new StringBuilder();
        boolean first = true;
        for (String[] entry : arrayList) {
            double weight = Double.parseDouble(entry[0]);
            String term = entry[1];
            // Check language and weight first so the regex is only built for candidates.
            if (!entry[3].equals(str2) || !(weight > 0.0d)) {
                continue;
            }
            // NOTE(review): the term is interpreted as a regular expression, as before.
            if (Pattern.compile(ShingleFilter.TOKEN_SEPARATOR + term + ShingleFilter.TOKEN_SEPARATOR).matcher(haystack).find()) {
                if (!first) {
                    matches.append(text_cc_separ);
                }
                matches.append(entry[4]);
                first = false;
            }
        }
        return matches.toString();
    }

    /**
     * Tokenizes {@code str} with the analyzer that belongs to language {@code str2}.
     *
     * <p>For {@code "lt"} the work is delegated to {@link LithuanianAnalyzer}. For
     * every other language the analyzer is fetched from {@code analyzerFactory},
     * stored in the {@code analyzer} field and used to read the terms of the
     * {@code "contents"} token stream.
     *
     * @param str  text to tokenize
     * @param str2 language code selecting the analyzer
     * @return the terms in stream order, or {@code null} when the non-Lithuanian
     *         branch fails with any exception (the stack trace is printed)
     * @throws IOException only from the Lithuanian branch, which runs outside the
     *                     catch-all handler
     */
    public static ArrayList<String> analyze(String str, String str2) throws IOException {
        if (str2.equals("lt")) {
            return LithuanianAnalyzer.analyze(str);
        }
        ArrayList<String> terms = new ArrayList<>();
        try {
            analyzer = analyzerFactory.getAnalyzer(str2);
            TokenStream stream = analyzer.tokenStream("contents", new StringReader(str));
            CharTermAttribute termAttribute = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);
            while (stream.incrementToken()) {
                terms.add(termAttribute.toString());
            }
            return terms;
        } catch (Exception e) {
            e.printStackTrace();
            return null;
        }
    }

    /**
     * Tells whether {@code str} contains at least {@code i} whitespace-delimited
     * tokens, as counted by {@link StringTokenizer} with its default delimiters.
     *
     * @param str text to count tokens in
     * @param i   minimum number of tokens required
     * @return {@code true} when the token count is not below {@code i}
     */
    public static Boolean countWords(String str, int i) {
        int tokenCount = new StringTokenizer(str).countTokens();
        return tokenCount >= i;
    }

    /**
     * Tells whether {@code str} contains any of the given words as a substring.
     *
     * <p>A match at the very beginning of the text counts as well. {@code null}
     * and empty entries in {@code strArr} are skipped, because an empty string
     * is a substring of every text.
     *
     * @param str    text to search; {@code null} yields {@code false}
     * @param strArr words to look for; {@code null} yields {@code false}
     * @return {@code true} as soon as one word is found, otherwise {@code false}
     */
    public static Boolean findWords(String str, String[] strArr) {
        if (str == null || strArr == null) {
            return false;
        }
        for (String word : strArr) {
            if (word != null && !word.isEmpty() && str.contains(word)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the complete {@code cesHeader} element to the given stream writer.
     *
     * <p>The header consists of a {@code fileDesc} (title statement, publication
     * statement, source description) followed by a {@code profileDesc} (language
     * usage, text classification, annotations). Parameters {@code str3},
     * {@code str4}, {@code str5}, {@code str7}, {@code str10} and {@code str12}
     * are replaced by an empty string when {@code null}; the others are passed to
     * the writer as they are.
     *
     * @param xMLStreamWriter2 destination writer
     * @param str       text of the source {@code eAddress} element
     * @param str2      text of the source {@code pubDate} element
     * @param str3      text of the {@code author} element, may be {@code null}
     * @param str4      text of the publisher element, may be {@code null}
     * @param str5      value of the {@code iso639} attribute, may be {@code null}
     * @param str6      text of both {@code title} elements
     * @param str7      text of the domain element, may be {@code null}
     * @param arrayList key terms; when {@code null} no {@code keywords} element is written
     * @param str8      text of the annotation element
     * @param str9      text of the format element
     * @param str10     text of the {@code subdomain} element, may be {@code null}
     * @param str11     license value split on {@code text_cc_separ}: first part is the
     *                  text, second part (if any) the {@code target} attribute; when empty
     *                  an {@code availability} element is written instead
     * @param str12     text of the {@code genre} element, may be {@code null}
     * @param d         written as the {@code confidence} attribute of the domain element
     *                  when greater than zero
     * @throws XMLStreamException if the underlying writer fails
     */
    private static void createHeader(XMLStreamWriter2 xMLStreamWriter2, String str, String str2, String str3, String str4, String str5, String str6, String str7, ArrayList<String> arrayList, String str8, String str9, String str10, String str11, String str12, double d) throws XMLStreamException {
        final XMLStreamWriter2 out = xMLStreamWriter2;

        out.writeStartElement("cesHeader");
        out.writeAttribute("version", cesDocVersion);

        // ---- fileDesc ----
        out.writeStartElement("fileDesc");

        out.writeStartElement("titleStmt");
        out.writeStartElement("title");
        out.writeCharacters(str6.toString());
        out.writeEndElement();
        out.writeStartElement("respStmt");
        out.writeStartElement("resp");
        writeHeaderTextElement(out, "type", "Crawling and normalization");
        writeHeaderTextElement(out, "name", "ILSP");
        out.writeEndElement(); // resp
        out.writeEndElement(); // respStmt
        out.writeEndElement(); // titleStmt

        out.writeStartElement("publicationStmt");
        writeHeaderTextElement(out, "distributor", String.valueOf(researchProject) + " project");
        out.writeStartElement("eAddress");
        out.writeAttribute("type", "web");
        out.writeCharacters("project_website");
        out.writeEndElement();
        if (StringUtils.isEmpty(str11)) {
            // No license known for the document.
            writeHeaderTextElement(out, "availability", "Under review");
        }
        writeHeaderTextElement(out, "pubDate", year);
        out.writeEndElement(); // publicationStmt

        out.writeStartElement("sourceDesc");
        out.writeStartElement("biblStruct");
        out.writeStartElement("monogr");
        out.writeStartElement("title");
        out.writeCharacters(str6.toString());
        out.writeEndElement();
        writeHeaderTextElement(out, "author", str3 == null ? "" : str3);
        out.writeStartElement("imprint");
        writeHeaderTextElement(out, DublinCore.FORMAT, str9);
        writeHeaderTextElement(out, DublinCore.PUBLISHER, str4 == null ? "" : str4);
        writeHeaderTextElement(out, "pubDate", str2);
        writeHeaderTextElement(out, "eAddress", str);
        if (!StringUtils.isEmpty(str11)) {
            out.writeStartElement("license");
            String[] licenseParts = str11.split(text_cc_separ);
            if (licenseParts.length > 1) {
                out.writeAttribute("target", licenseParts[1]);
            }
            out.writeCharacters(licenseParts[0]);
            out.writeEndElement();
        }
        out.writeEndElement(); // imprint
        out.writeEndElement(); // monogr
        out.writeEndElement(); // biblStruct
        out.writeEndElement(); // sourceDesc

        out.writeEndElement(); // fileDesc

        // ---- profileDesc ----
        out.writeStartElement("profileDesc");

        out.writeStartElement("langUsage");
        out.writeStartElement("language");
        out.writeAttribute("iso639", str5 == null ? "" : str5);
        out.writeEndElement();
        out.writeEndElement(); // langUsage

        out.writeStartElement("textClass");
        if (arrayList != null) {
            out.writeStartElement("keywords");
            for (String keyTerm : arrayList) {
                writeHeaderTextElement(out, "keyTerm", keyTerm);
            }
            out.writeEndElement(); // keywords
        }
        out.writeStartElement(ClientCookie.DOMAIN_ATTR);
        if (d > 0.0d) {
            out.writeAttribute("confidence", Double.toString(d));
        }
        out.writeCharacters(str7 == null ? "" : str7);
        out.writeEndElement();
        writeHeaderTextElement(out, "subdomain", str10 == null ? "" : str10);
        writeHeaderTextElement(out, "genre", str12 == null ? "" : str12);
        out.writeStartElement(DublinCore.SUBJECT);
        out.writeEndElement();
        out.writeEndElement(); // textClass

        out.writeStartElement("annotations");
        writeHeaderTextElement(out, JamXmlElements.ANNOTATION, str8);
        out.writeEndElement(); // annotations

        out.writeEndElement(); // profileDesc

        out.writeEndElement(); // cesHeader
    }

    /** Writes {@code <name>text</name>} as a start element, character data and end element. */
    private static void writeHeaderTextElement(XMLStreamWriter2 out, String name, String text) throws XMLStreamException {
        out.writeStartElement(name);
        out.writeCharacters(text);
        out.writeEndElement();
    }

    /**
     * Sets {@code MIN_TOKENS_PER_PARAGRAPH}, the value this class passes to
     * {@code countWords} as the minimum token count of a paragraph.
     *
     * @param i new minimum number of tokens per paragraph
     */
    public void setMIN_TOKENS_PER_PARAGRAPH(int i) {
        MIN_TOKENS_PER_PARAGRAPH = i;
    }

    /**
     * Sets {@code MIN_TOKENS_NUMBER}.
     * NOTE(review): presumably the minimum token count of a whole document - confirm at the usage site.
     *
     * @param i new value
     */
    public void setMIN_TOKENS_NUMBER(int i) {
        MIN_TOKENS_NUMBER = i;
    }

    /**
     * Returns the current value of {@code crawlDirName}.
     *
     * @return the crawl directory name
     */
    public static String getCrawlDirName() {
        return crawlDirName;
    }

    /**
     * Sets {@code crawlDirName}.
     *
     * @param str new crawl directory name
     */
    public void setCrawlDirName(String str) {
        crawlDirName = str;
    }

    /**
     * Sets {@code language}. {@code findTopicTerms} splits this value on
     * {@code text_cc_separ}, so it may hold several separator-delimited codes.
     *
     * @param str new language value
     */
    public void setLanguage(String str) {
        language = str;
    }

    /**
     * Returns the current value of {@code language}.
     *
     * @return the configured language value
     */
    public static String getLanguage() {
        return language;
    }

    /**
     * Returns the current value of {@code topic}.
     *
     * @return the configured topic value
     */
    public static String getTopic() {
        return topic;
    }

    /**
     * Sets {@code topic}.
     *
     * @param str new topic value
     */
    public void setTopic(String str) {
        topic = str;
    }

    /**
     * Returns the current value of {@code negWordsFile}.
     *
     * @return the configured negative-words file value
     */
    public static String getNegWordsFile() {
        return negWordsFile;
    }

    /**
     * Sets {@code negWordsFile}.
     *
     * @param str new negative-words file value
     */
    public void setNegWordsFile(String str) {
        negWordsFile = str;
    }

    /**
     * Sets {@code outputFile}.
     *
     * @param str new output file value
     */
    public void setOutputFile(String str) {
        outputFile = str;
    }

    /**
     * Sets {@code outputFileHTML}.
     *
     * @param str new HTML output file value
     */
    public void setOutputFileHTML(String str) {
        outputFileHTML = str;
    }

    /**
     * Sets {@code outputDir}.
     *
     * @param str new output directory value
     */
    public void setOutputDir(String str) {
        outputDir = str;
    }

    /**
     * Sets the {@code textExport} flag.
     *
     * @param z new flag value
     */
    public void setTextExport(boolean z) {
        textExport = z;
    }

    /**
     * Sets the {@code cesdoc} flag. Note that the method name does not match
     * the field it writes.
     *
     * @param z new flag value
     */
    public void setStyleExport(boolean z) {
        cesdoc = z;
    }

    /**
     * Sets the {@code html} flag.
     *
     * @param z new flag value
     */
    public void setHTMLOutput(boolean z) {
        html = z;
    }

    /**
     * Sets {@code mimetypes}. The array reference is stored as given, without
     * copying.
     *
     * @param strArr new array of accepted MIME types
     */
    public void setAcceptedMimeTypes(String[] strArr) {
        mimetypes = strArr;
    }

    /**
     * Sets {@code targeteddomain}.
     *
     * @param str new targeted domain value
     */
    public void setTargetedDomain(String str) {
        targeteddomain = str;
    }

    /**
     * Sets {@code genres}.
     *
     * @param url new genres URL
     */
    public void setGenres(URL url) {
        genres = url;
    }

    /**
     * Returns the current value of {@code genres}.
     *
     * @return the configured genres URL
     */
    public static URL getGenres() {
        return genres;
    }

    /**
     * Returns the current value of {@code researchProject}, which
     * {@code createHeader} writes into the {@code distributor} element.
     *
     * @return the research project name
     */
    public static String getResearchProject() {
        return researchProject;
    }

    /**
     * Sets {@code researchProject}, which {@code createHeader} writes into the
     * {@code distributor} element.
     *
     * @param str new research project name
     */
    public static void setResearchProject(String str) {
        researchProject = str;
    }

    /**
     * Returns the current value of the {@code applyOfflineXSLT} flag.
     *
     * @return {@code true} when exported XML files are also transformed with XSLT
     */
    public boolean isApplyOfflineXSLT() {
        return applyOfflineXSLT;
    }

    /**
     * Enables or disables the offline XSLT transformation of exported XML files.
     *
     * <p>When enabling, a new {@code XSLTransformer} is created. If creation
     * fails and no transformer is available, the failure is logged and the flag
     * is reset to {@code false}.
     *
     * @param z {@code true} to enable the transformation, {@code false} to disable it
     */
    public void setApplyOfflineXSLT(boolean z) {
        applyOfflineXSLT = z;
        if (!z) {
            return;
        }
        try {
            xslTransformer = new XSLTransformer();
        } catch (IOException | TransformerConfigurationException e) {
            // MalformedURLException is an IOException, so it is handled here too.
            LOGGER.warn("Could not create XSLTransformer: " + e.getMessage(), e);
        }
        if (xslTransformer == null) {
            LOGGER.warn("Cannot initialize xslTransformer. Will not transform XML files using xslt.");
            applyOfflineXSLT = false;
        }
    }
}
