package bixo.parser;

import bixo.config.ParserPolicy;
import bixo.datum.FetchedDatum;
import bixo.datum.ParsedDatum;
import bixo.utils.IoUtils;
import java.io.ByteArrayInputStream;
import java.io.Serializable;
import java.util.Locale;
import java.util.concurrent.FutureTask;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.TimeoutException;
import org.apache.log4j.Logger;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.parser.html.IdentityHtmlMapper;

/* loaded from: input_file:bixo/parser/SimpleParser.class */
/**
 * Parser that feeds the bytes of a {@link FetchedDatum} to a Tika {@link Parser} and
 * returns the resulting {@link ParsedDatum}.
 *
 * <p>Each call to {@link #parse(FetchedDatum)} runs the Tika parse on a freshly created
 * thread and waits for it for at most {@code getParserPolicy().getMaxParseDuration()}
 * milliseconds. Text and link collection is delegated to the configured
 * {@link BaseContentExtractor} and {@link BaseLinkExtractor}.
 */
public class SimpleParser extends BaseParser {
    private static final Logger LOGGER = Logger.getLogger(SimpleParser.class);

    /** Forwarded to the parse task on every call; {@code true} unless changed via the setter. */
    private boolean _extractLanguage;

    protected BaseContentExtractor _contentExtractor;
    protected BaseLinkExtractor _linkExtractor;

    /** Optional Tika parse context handed to the parse task; may be {@code null}. */
    protected ParseContext _parseContext;

    // Created lazily by init(). Being transient, it is not written out by Java serialization,
    // so init() also re-creates it on a deserialized instance.
    private transient Parser _parser;

    /**
     * {@link IdentityHtmlMapper} variant whose {@link #mapSafeElement(String)} returns the
     * element name lower-cased with {@link Locale#ENGLISH} rules.
     *
     * <p>Declared {@link Serializable}, presumably so that a {@link ParseContext} holding
     * it can be serialized together with the enclosing parser (TODO confirm).
     */
    private static class FixedIdentityHtmlMapper extends IdentityHtmlMapper implements Serializable {
        public static final HtmlMapper INSTANCE = new FixedIdentityHtmlMapper();

        private FixedIdentityHtmlMapper() {
        }

        /**
         * Maps every element name to its lower-case form.
         *
         * @param str the element name; must not be {@code null}
         * @return {@code str} lower-cased using {@link Locale#ENGLISH}
         */
        @Override // org.apache.tika.parser.html.IdentityHtmlMapper, org.apache.tika.parser.html.HtmlMapper
        public String mapSafeElement(String str) {
            return str.toLowerCase(Locale.ENGLISH);
        }
    }

    /** Creates a parser with a default-constructed {@link ParserPolicy}. */
    public SimpleParser() {
        this(new ParserPolicy());
    }

    /**
     * Creates a parser using {@link SimpleContentExtractor}, {@link SimpleLinkExtractor}
     * and no parse context.
     *
     * @param parserPolicy policy passed to {@link BaseParser}
     */
    public SimpleParser(ParserPolicy parserPolicy) {
        this(new SimpleContentExtractor(), new SimpleLinkExtractor(), parserPolicy, (ParseContext) null);
    }

    /**
     * Creates a parser with caller-supplied extractors and no parse context.
     *
     * @param baseContentExtractor extractor used for content
     * @param baseLinkExtractor    extractor used for links
     * @param parserPolicy         policy passed to {@link BaseParser}
     */
    public SimpleParser(BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, ParserPolicy parserPolicy) {
        this(baseContentExtractor, baseLinkExtractor, parserPolicy, (ParseContext) null);
    }

    /**
     * Creates a parser whose extractors are chosen by a flag.
     *
     * @param parserPolicy policy passed to {@link BaseParser}
     * @param z            when {@code true}, uses {@link HtmlContentExtractor} with
     *                     {@link NullLinkExtractor#INSTANCE} and installs the identity HTML
     *                     mapper; when {@code false}, uses {@link SimpleContentExtractor}
     *                     with {@link SimpleLinkExtractor} and no parse context
     */
    public SimpleParser(ParserPolicy parserPolicy, boolean z) {
        this(z ? new HtmlContentExtractor() : new SimpleContentExtractor(), z ? NullLinkExtractor.INSTANCE : new SimpleLinkExtractor(), parserPolicy, z);
    }

    /**
     * Creates a parser with caller-supplied extractors and an optional identity HTML mapper.
     *
     * @param baseContentExtractor extractor used for content
     * @param baseLinkExtractor    extractor used for links
     * @param parserPolicy         policy passed to {@link BaseParser}
     * @param z                    when {@code true}, a new {@link ParseContext} is created
     *                             with {@link HtmlMapper} bound to the lower-casing identity
     *                             mapper; when {@code false}, the parse context stays
     *                             {@code null}
     */
    public SimpleParser(BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, ParserPolicy parserPolicy, boolean z) {
        super(parserPolicy);
        this._extractLanguage = true;
        this._contentExtractor = baseContentExtractor;
        this._linkExtractor = baseLinkExtractor;
        if (z) {
            this._parseContext = new ParseContext();
            this._parseContext.set(HtmlMapper.class, FixedIdentityHtmlMapper.INSTANCE);
        }
    }

    /**
     * Creates a parser with caller-supplied extractors and parse context.
     *
     * @param baseContentExtractor extractor used for content
     * @param baseLinkExtractor    extractor used for links
     * @param parserPolicy         policy passed to {@link BaseParser}
     * @param parseContext         context handed to the parse task; may be {@code null}
     */
    public SimpleParser(BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, ParserPolicy parserPolicy, ParseContext parseContext) {
        super(parserPolicy);
        this._extractLanguage = true;
        this._contentExtractor = baseContentExtractor;
        this._linkExtractor = baseLinkExtractor;
        this._parseContext = parseContext;
    }

    /**
     * Prepares this parser for one parse: creates the Tika parser on first use, resets the
     * content extractor, and re-applies the policy's link tags and link attribute types to
     * the link extractor before resetting it.
     */
    protected synchronized void init() {
        if (this._parser == null) {
            this._parser = getTikaParser();
        }
        this._contentExtractor.reset();
        this._linkExtractor.setLinkTags(getParserPolicy().getLinkTags());
        this._linkExtractor.setLinkAttributeTypes(getParserPolicy().getLinkAttributeTypes());
        this._linkExtractor.reset();
    }

    /**
     * Supplies the Tika parser used by this instance; called by {@link #init()} when no
     * parser has been created yet.
     *
     * @return a new {@link AutoDetectParser}
     */
    public Parser getTikaParser() {
        return new AutoDetectParser();
    }

    /**
     * Sets the language-extraction flag that is passed to the parse task.
     *
     * @param z the new flag value
     */
    public void setExtractLanguage(boolean z) {
        this._extractLanguage = z;
    }

    /**
     * @return the language-extraction flag that is passed to the parse task
     */
    public boolean isExtractLanguage() {
        return this._extractLanguage;
    }

    /**
     * Parses the content of {@code fetchedDatum} on a separate thread, waiting at most
     * {@code getParserPolicy().getMaxParseDuration()} milliseconds for the result.
     *
     * <p>The datum's URL, content type, language (as computed by {@code getLanguage}) and
     * content location are passed to Tika as metadata. On success the host address and
     * payload of {@code fetchedDatum} are copied onto the result.
     *
     * @param fetchedDatum the fetched content to parse
     * @return the parsed result
     * @throws TimeoutException if the parse does not finish in time; the parse task is
     *                          cancelled and its thread interrupted before rethrowing
     * @throws Exception        any other failure raised while preparing the metadata or
     *                          waiting for the parse task, including
     *                          {@link InterruptedException} (the parse task is cancelled
     *                          in that case too)
     */
    @Override // bixo.parser.BaseParser
    public ParsedDatum parse(FetchedDatum fetchedDatum) throws Exception {
        init();
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace(String.format("Parsing %s", fetchedDatum.getUrl()));
        }

        Metadata metadata = new Metadata();
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, fetchedDatum.getUrl());
        metadata.add("Content-Type", fetchedDatum.getContentType());
        metadata.add("Content-Language", getLanguage(fetchedDatum, getCharset(fetchedDatum)));

        ByteArrayInputStream content = new ByteArrayInputStream(fetchedDatum.getContentBytes(), 0, fetchedDatum.getContentLength());
        try {
            metadata.add("Content-Location", getContentLocation(fetchedDatum).toExternalForm());

            FutureTask<ParsedDatum> task = new FutureTask<ParsedDatum>(new TikaCallable(this._parser, this._contentExtractor, this._linkExtractor, content, metadata, isExtractLanguage(), this._parseContext));
            Thread worker = new Thread(task);
            worker.start();

            ParsedDatum parsedDatum;
            try {
                parsedDatum = task.get(getParserPolicy().getMaxParseDuration(), TimeUnit.MILLISECONDS);
            } catch (TimeoutException | InterruptedException e) {
                // Nobody will consume the result any more, so stop the worker instead of
                // letting it keep parsing in the background.
                task.cancel(true);
                worker.interrupt();
                throw e;
            }

            parsedDatum.setHostAddress(fetchedDatum.getHostAddress());
            parsedDatum.setPayload(fetchedDatum.getPayload());
            return parsedDatum;
        } finally {
            // Single cleanup point for both the success and the failure paths.
            IoUtils.safeClose(content);
        }
    }
}
