package gr.ilsp.fmc.parser;

import bixo.config.ParserPolicy;
import bixo.datum.FetchedDatum;
import bixo.parser.BaseContentExtractor;
import bixo.parser.SimpleContentExtractor;
import bixo.utils.HttpUtils;
import bixo.utils.IoUtils;
import gr.ilsp.fmc.datums.ExtendedParsedDatum;
import java.io.ByteArrayInputStream;
import java.io.Serializable;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.Callable;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.Parser;
import org.apache.tika.utils.CharsetUtils;

/* loaded from: input_file:gr/ilsp/fmc/parser/SimpleNoLinksParser.class */
/**
 * Parses the content of a {@link FetchedDatum} into an {@link ExtendedParsedDatum}.
 *
 * <p>Documents whose content type is {@code application/pdf} are handed to a
 * {@code PdfboxCallableParser}; everything else is handed to a {@code TikaCallableParser}.
 * The actual parsing runs on a separate thread so that it can be abandoned once the
 * maximum parse duration configured in the {@link ParserPolicy} has elapsed.
 *
 * <p>The class is also a {@link Callable}: {@link #call()} parses the datum that was
 * supplied to {@link #SimpleNoLinksParser(FetchedDatum)}.
 */
public class SimpleNoLinksParser implements Serializable, Callable<ExtendedParsedDatum> {
    private static final long serialVersionUID = -6644451796914124629L;
    private static final Logger LOGGER = Logger.getLogger(SimpleNoLinksParser.class);

    /** Media type that selects the PDF-specific parser. */
    private static final String PDF_MEDIA_TYPE = "application/pdf";

    private boolean _extractLanguage;
    protected BaseContentExtractor _contentExtractor;
    /** Created lazily by {@link #init()}; transient, so it is rebuilt after deserialization. */
    private transient Parser _parser;
    private ParserPolicy _policy;
    /** Datum parsed by {@link #call()}; only set by the {@link FetchedDatum} constructor. */
    private FetchedDatum _datum;
    private boolean _keepBoiler;
    private String _storedir_path;

    /**
     * Creates a parser with default settings that will parse {@code fetchedDatum}
     * when {@link #call()} is invoked.
     *
     * @param fetchedDatum the datum to parse in {@link #call()}
     */
    public SimpleNoLinksParser(FetchedDatum fetchedDatum) {
        this();
        this._datum = fetchedDatum;
    }

    /** Creates a parser with a default {@link ParserPolicy} and a {@link SimpleContentExtractor}. */
    public SimpleNoLinksParser() {
        this(new ParserPolicy());
    }

    /**
     * Creates a parser with a default {@link ParserPolicy}.
     *
     * @param z   value of the keep-boilerplate flag forwarded to the callable parsers
     * @param str storage directory path forwarded to the PDF callable parser
     */
    public SimpleNoLinksParser(boolean z, String str) {
        this(new ParserPolicy());
        this._keepBoiler = z;
        this._storedir_path = str;
    }

    /**
     * Creates a parser that uses a {@link SimpleContentExtractor} and the given policy.
     *
     * @param parserPolicy policy supplying, among others, the maximum parse duration
     */
    public SimpleNoLinksParser(ParserPolicy parserPolicy) {
        this(new SimpleContentExtractor(), parserPolicy);
    }

    /**
     * Creates a parser with language extraction enabled and boilerplate keeping disabled.
     *
     * @param baseContentExtractor extractor forwarded to the callable parsers
     * @param parserPolicy         policy supplying, among others, the maximum parse duration
     */
    public SimpleNoLinksParser(BaseContentExtractor baseContentExtractor, ParserPolicy parserPolicy) {
        this._extractLanguage = true;
        this._keepBoiler = false;
        this._policy = parserPolicy;
        this._contentExtractor = baseContentExtractor;
    }

    /** Lazily creates the Tika parser and resets the content extractor before a parse. */
    protected synchronized void init() {
        if (this._parser == null) {
            this._parser = getTikaParser();
        }
        this._contentExtractor.reset();
    }

    /**
     * Returns a new Tika parser instance.
     *
     * @return a fresh {@link AutoDetectParser}
     */
    public Parser getTikaParser() {
        return new AutoDetectParser();
    }

    /**
     * Sets the language-extraction flag forwarded to the callable parsers.
     *
     * @param z new value of the flag
     */
    public void setExtractLanguage(boolean z) {
        this._extractLanguage = z;
    }

    /**
     * Returns the language-extraction flag forwarded to the callable parsers.
     *
     * @return current value of the flag
     */
    public boolean isExtractLanguage() {
        return this._extractLanguage;
    }

    /**
     * Parses the content of {@code fetchedDatum} on a worker thread, waiting at most
     * {@code getParserPolicy().getMaxParseDuration()} milliseconds for the result.
     *
     * @param fetchedDatum the fetched document to parse
     * @return the parsed datum, with host address and payload copied from {@code fetchedDatum}
     * @throws TimeoutException     if parsing did not finish in time; the worker is cancelled
     * @throws InterruptedException if the calling thread was interrupted while waiting;
     *                              the worker is cancelled
     * @throws Exception            any other failure while preparing or running the parse
     */
    public ExtendedParsedDatum parse(FetchedDatum fetchedDatum) throws Exception {
        init();
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace(String.format("Parsing %s", fetchedDatum.getUrl()));
        }

        Metadata metadata = new Metadata();
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fetchedDatum.getUrl());
        metadata.add("Content-Type", fetchedDatum.getContentType());
        String charset = getCharset(fetchedDatum);
        metadata.add("Content-Language", getLanguage(fetchedDatum, charset));
        metadata.add("Content-Encoding", charset);
        ByteArrayInputStream content =
                new ByteArrayInputStream(fetchedDatum.getContentBytes(), 0, fetchedDatum.getContentLength());
        metadata.add("Content-Length", Integer.toString(fetchedDatum.getContentLength()));
        try {
            metadata.add("Content-Location", getContentLocation(fetchedDatum).toExternalForm());

            FutureTask<ExtendedParsedDatum> task = newParseTask(content, metadata);
            Thread worker = new Thread(task);
            worker.start();
            try {
                ExtendedParsedDatum parsed =
                        task.get(getParserPolicy().getMaxParseDuration(), TimeUnit.MILLISECONDS);
                parsed.setHostAddress(fetchedDatum.getHostAddress());
                parsed.setPayload(fetchedDatum.getPayload());
                return parsed;
            } catch (TimeoutException e) {
                task.cancel(true);
                worker.interrupt();
                throw e;
            } catch (InterruptedException e) {
                // The caller gave up waiting: do not leave the worker parsing in the background.
                task.cancel(true);
                throw e;
            }
        } finally {
            IoUtils.safeClose(content);
        }
    }

    /**
     * Builds the task that performs the actual parsing, choosing the PDF-specific
     * callable for PDF content and the Tika callable otherwise.
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private FutureTask<ExtendedParsedDatum> newParseTask(ByteArrayInputStream content, Metadata metadata)
            throws Exception {
        // Raw type on purpose: the callable classes are declared elsewhere and their type
        // argument is not visible here. The result is still checked when the task's value is read.
        Callable parserCallable;
        if (isPdfContentType(metadata.get("Content-Type"))) {
            // NOTE(review): the Tika parser is dropped here, so the PDF callable receives null
            // and init() builds a new parser on the next parse. Kept as in the original;
            // confirm that PdfboxCallableParser really does not need a parser instance.
            this._parser = null;
            LOGGER.info("pdf reached");
            parserCallable = new PdfboxCallableParser(this._parser, this._contentExtractor, content, metadata,
                    isExtractLanguage(), this._keepBoiler, this._storedir_path);
        } else {
            parserCallable = new TikaCallableParser(this._parser, this._contentExtractor, content, metadata,
                    isExtractLanguage(), this._keepBoiler);
        }
        return new FutureTask<ExtendedParsedDatum>(parserCallable);
    }

    /**
     * Tells whether a Content-Type value denotes a PDF, ignoring case, surrounding
     * whitespace and any parameters after the first {@code ';'}. A null value is not a PDF.
     */
    private static boolean isPdfContentType(String contentType) {
        if (contentType == null) {
            return false;
        }
        int paramStart = contentType.indexOf(';');
        String mediaType = paramStart < 0 ? contentType : contentType.substring(0, paramStart);
        return PDF_MEDIA_TYPE.equalsIgnoreCase(mediaType.trim());
    }

    /**
     * Resolves the location of the content: the fetched URL, or the {@code Content-Location}
     * header resolved against the fetched URL when that header is present.
     *
     * @param fetchedDatum the fetched document
     * @return the resolved content location
     * @throws MalformedURLException if the fetched URL or the resolved location is not a valid URL
     */
    protected URL getContentLocation(FetchedDatum fetchedDatum) throws MalformedURLException {
        URL fetchedUrl = new URL(fetchedDatum.getFetchedUrl());
        String contentLocation = fetchedDatum.getHeaders().getFirst("Content-Location");
        if (contentLocation == null) {
            return fetchedUrl;
        }
        return new URL(fetchedUrl, contentLocation);
    }

    /**
     * Returns the charset named in the datum's content type, passed through
     * {@link CharsetUtils#clean(String)}.
     *
     * @param fetchedDatum the fetched document
     * @return the cleaned charset name as returned by {@code CharsetUtils.clean}
     */
    protected String getCharset(FetchedDatum fetchedDatum) {
        return CharsetUtils.clean(HttpUtils.getCharsetFromContentType(fetchedDatum.getContentType()));
    }

    /**
     * Returns the first {@code Content-Language} header of the datum.
     *
     * @param fetchedDatum the fetched document
     * @param str          the document charset; not used by this implementation
     * @return the header value as returned by {@code getHeaders().getFirst(...)}
     */
    protected String getLanguage(FetchedDatum fetchedDatum, String str) {
        return fetchedDatum.getHeaders().getFirst("Content-Language");
    }

    /**
     * Returns the policy this parser was constructed with.
     *
     * @return the parser policy
     */
    public ParserPolicy getParserPolicy() {
        return this._policy;
    }

    /**
     * Parses the datum supplied at construction time.
     *
     * @return the parsed datum
     * @throws IllegalStateException if this instance was not created with a {@link FetchedDatum}
     * @throws Exception             any failure reported by {@link #parse(FetchedDatum)}
     */
    @Override // java.util.concurrent.Callable
    public ExtendedParsedDatum call() throws Exception {
        if (this._datum == null) {
            throw new IllegalStateException(
                    "No FetchedDatum to parse: use the SimpleNoLinksParser(FetchedDatum) constructor");
        }
        return parse(this._datum);
    }
}
