package gr.ilsp.fmc.parser;

import bixo.parser.BaseContentExtractor;
import gr.ilsp.boilerpipe.extractors.NumWordsRulesExtractor;
import gr.ilsp.fmc.datums.ExtendedParsedDatum;
import gr.ilsp.fmc.utils.ContentNormalizer;
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import org.apache.log4j.Logger;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.TeeContentHandler;

/* loaded from: input_file:gr/ilsp/fmc/parser/TikaCallableParser.class */
/**
 * A {@link Callable} that runs a Tika {@link Parser} over an input stream and packages the
 * result as an {@link ExtendedParsedDatum}.
 *
 * <p>Besides parsing, {@link #call()} extracts the text of the document with a boilerpipe
 * {@code NumWordsRulesExtractor}, normalizes it, collects the outlinks, and tries to detect a
 * licence statement which it stores in the metadata under {@link CreativeCommons#LICENSE_URL}.
 *
 * <p>The input stream is {@code reset()} after parsing, so it must support being re-read from
 * its start (NOTE(review): presumably callers pass a fully buffered stream - confirm).
 */
public class TikaCallableParser implements Callable<ExtendedParsedDatum> {
    private static final String LICENSES_STR = "/licenses/";
    private static final String HTTP_PROTOCOL = "http";
    private static final String HTTPS_PROTOCOL = "https";
    private static final String CREATIVECOMMONS_ORG_STR = "creativecommons.org";
    private static final String EUROPE_ORG_STR = "europa.eu";
    private static final String CC_pattern = "Creative Commons";
    private static final String default_CCurl_in_text = "http://creativecommons.org/licenses/by/3.0/";
    private static final String default_CCcomment_in_url = "Distributed under a Creative Commons license";
    private static final String default_Europecomment_in_url = "©European Union, 1995-2014. Reuse is authorised, provided the source is acknowledged.";
    private static final String default_CCcomment_in_text = "Distributed under a Creative Commons license";
    private static final String text_cc_separ = ";";
    /** Metadata key holding the character encoding of the document. */
    private static final String CONTENT_ENCODING_KEY = "Content-Encoding";
    /** Metadata key holding the location (URL) of the document. */
    private static final String CONTENT_LOCATION_KEY = "Content-Location";
    /** Charset used to re-read the input when the metadata carries no encoding at all. */
    private static final String FALLBACK_CHARSET = "UTF-8";
    private static final Logger LOGGER = Logger.getLogger(TikaCallableParser.class);
    private final Parser _parser;
    private final BaseContentExtractor _contentExtractor;
    private final InputStream _input;
    private final Metadata _metadata;
    private final boolean _extractLanguage;
    private final boolean _keepBoiler;

    /**
     * Creates a parser task that profiles the language and uses the de.l3s boilerpipe extractor.
     *
     * @param parser the Tika parser to run
     * @param baseContentExtractor content handler that receives the parse events
     * @param inputStream the document content; it is reset and re-read after parsing
     * @param metadata metadata passed to the parser and updated by {@link #call()}
     */
    public TikaCallableParser(Parser parser, BaseContentExtractor baseContentExtractor, InputStream inputStream, Metadata metadata) {
        this(parser, baseContentExtractor, inputStream, metadata, true, false);
    }

    /**
     * Creates a parser task.
     *
     * @param parser the Tika parser to run
     * @param baseContentExtractor content handler that receives the parse events
     * @param inputStream the document content; it is reset and re-read after parsing
     * @param metadata metadata passed to the parser and updated by {@link #call()}
     * @param extractLanguage when {@code true}, a {@link ProfilingHandler} is attached next to
     *     the content extractor
     * @param keepBoiler when {@code true}, the ILSP {@code NumWordsRulesExtractor} is called
     *     with its boolean flag set; otherwise the de.l3s extractor is used
     */
    public TikaCallableParser(Parser parser, BaseContentExtractor baseContentExtractor, InputStream inputStream, Metadata metadata, boolean extractLanguage, boolean keepBoiler) {
        this._parser = parser;
        this._contentExtractor = baseContentExtractor;
        this._input = inputStream;
        this._metadata = metadata;
        this._extractLanguage = extractLanguage;
        this._keepBoiler = keepBoiler;
    }

    /**
     * Parses the input, extracts and normalizes its text, collects its outlinks and records a
     * detected licence in the metadata.
     *
     * @return the parsed datum built from the resource name, normalized text, title, outlinks
     *     and a copy of the metadata
     * @throws Exception any exception raised while parsing or extracting; rethrown unchanged
     * @throws RuntimeException wrapping any non-{@code Exception} throwable (for example a
     *     {@link NoSuchMethodError}) raised during processing
     */
    @Override // java.util.concurrent.Callable
    public ExtendedParsedDatum call() throws Exception {
        try {
            TeeContentHandler handler = this._extractLanguage
                    ? new TeeContentHandler(this._contentExtractor, new ProfilingHandler())
                    : new TeeContentHandler(this._contentExtractor);

            // Remember the encoding supplied by the caller and restore it if the parser
            // replaced it with a different value.
            String declaredEncoding = this._metadata.get(CONTENT_ENCODING_KEY);
            this._parser.parse(this._input, handler, this._metadata, makeParseContext());
            if (declaredEncoding != null && !declaredEncoding.equals(this._metadata.get(CONTENT_ENCODING_KEY))) {
                this._metadata.set(CONTENT_ENCODING_KEY, declaredEncoding);
            }

            // Rewind so the boilerpipe extractor sees the document from its beginning.
            this._input.reset();
            String normalizedText = ContentNormalizer.normalizeText(extractText());

            // NOTE(review): the reader used for text extraction has already closed _input here;
            // this relies on the stream (or getLinks itself) tolerating that - confirm.
            ExtendedOutlink[] links = ExtendedLinksExtractor.getLinks(this._input, this._metadata);

            String location = this._metadata.get(CONTENT_LOCATION_KEY);
            String license = detectLicense(location, links, normalizedText);
            if (license != null) {
                this._metadata.set(CreativeCommons.LICENSE_URL, license);
            }
            LOGGER.debug(String.valueOf(location) + this._metadata.get(CreativeCommons.LICENSE_URL));

            return new ExtendedParsedDatum(this._metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY), null, normalizedText, "", this._metadata.get("title"), links, makeMap(this._metadata));
        } catch (Exception e) {
            // Rethrown untouched so that only non-Exception throwables get wrapped below.
            throw e;
        } catch (NoSuchMethodError e) {
            throw new RuntimeException("Attempting to use excluded parser", e);
        } catch (Throwable th) {
            throw new RuntimeException("Serious shut-down error thrown from Tika", th);
        }
    }

    /**
     * Reads the input with the encoding found in the metadata and runs the boilerpipe extractor
     * selected by {@code _keepBoiler}. The reader (and therefore the input) is closed afterwards,
     * whether or not extraction succeeds.
     *
     * @return the text produced by the extractor
     * @throws Exception if the encoding is unsupported or the extractor fails
     */
    private String extractText() throws Exception {
        String charsetName = this._metadata.get(CONTENT_ENCODING_KEY);
        if (charsetName == null) {
            // InputStreamReader rejects a null charset name with a NullPointerException.
            charsetName = FALLBACK_CHARSET;
        }
        BufferedReader reader = new BufferedReader(new InputStreamReader(this._input, charsetName));
        try {
            return this._keepBoiler
                    ? NumWordsRulesExtractor.INSTANCE.getText((Reader) reader, true)
                    : de.l3s.boilerpipe.extractors.NumWordsRulesExtractor.INSTANCE.getText(reader);
        } finally {
            reader.close();
        }
    }

    /**
     * Determines the licence value to store for the document, checking in order: the document
     * location (europa.eu), outlinks pointing to a creativecommons.org licence page, and a
     * "Creative Commons" mention in the text.
     *
     * @param location the document location, may be {@code null}
     * @param links the outlinks of the document, may be {@code null}
     * @param text the normalized text of the document, may be {@code null}
     * @return the licence value, or {@code null} when nothing was detected
     */
    private static String detectLicense(String location, ExtendedOutlink[] links, String text) {
        if (location != null && location.contains(EUROPE_ORG_STR)) {
            return default_Europecomment_in_url;
        }
        if (links != null) {
            for (ExtendedOutlink link : links) {
                try {
                    URL url = new URL(link.getToUrl().toString());
                    if (isCreativeCommonsLicenseUrl(url)) {
                        return default_CCcomment_in_url + text_cc_separ + url.toString();
                    }
                } catch (Exception e) {
                    // Best effort: an outlink that cannot be inspected must not fail the parse.
                    LOGGER.debug("Skipping outlink that could not be inspected for a licence: " + link, e);
                }
            }
        }
        if (text != null && text.contains(CC_pattern)) {
            return default_CCcomment_in_text + text_cc_separ + default_CCurl_in_text;
        }
        return null;
    }

    /**
     * Tells whether the URL is an http(s) link to a specific licence below
     * {@code creativecommons.org/licenses/}.
     */
    private static boolean isCreativeCommonsLicenseUrl(URL url) {
        String protocol = url.getProtocol();
        if (!HTTP_PROTOCOL.equalsIgnoreCase(protocol) && !HTTPS_PROTOCOL.equalsIgnoreCase(protocol)) {
            return false;
        }
        if (!CREATIVECOMMONS_ORG_STR.equalsIgnoreCase(url.getHost())) {
            return false;
        }
        String path = url.getPath();
        // The path must name a licence, not merely be the "/licenses/" index itself.
        return path != null && path.startsWith(LICENSES_STR) && path.length() > LICENSES_STR.length();
    }

    /** Builds the parse context, registering {@link DefaultHtmlMapper} as the HTML mapper. */
    private ParseContext makeParseContext() {
        ParseContext parseContext = new ParseContext();
        parseContext.set(HtmlMapper.class, DefaultHtmlMapper.INSTANCE);
        return parseContext;
    }

    /**
     * Copies the metadata into a map, keeping the value returned by {@link Metadata#get(String)}
     * for each name.
     */
    private static Map<String, String> makeMap(Metadata metadata) {
        Map<String, String> result = new HashMap<String, String>();
        for (String name : metadata.names()) {
            result.put(name, metadata.get(name));
        }
        return result;
    }
}
