package bixo.parser;

import bixo.datum.ParsedDatum;
import java.io.InputStream;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import org.apache.tika.language.LanguageIdentifier;
import org.apache.tika.language.ProfilingHandler;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.html.DefaultHtmlMapper;
import org.apache.tika.parser.html.HtmlMapper;
import org.apache.tika.sax.TeeContentHandler;

/* loaded from: input_file:bixo/parser/TikaCallable.class */
class TikaCallable implements Callable<ParsedDatum> {
    private static final Logger LOGGER = Logger.getLogger(TikaCallable.class);
    private static final Pattern LANGUAGE_CODE_PATTERN = Pattern.compile("([a-z]{2})([,;-]).*");
    private Parser _parser;
    private BaseContentExtractor _contentExtractor;
    private BaseLinkExtractor _linkExtractor;
    private InputStream _input;
    private Metadata _metadata;
    private boolean _extractLanguage;
    private ParseContext _parseContext;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:bixo/parser/TikaCallable$CustomHtmlMapper.class */
    public static class CustomHtmlMapper extends DefaultHtmlMapper {
        private Set<String> _validTags;
        private Set<String> _validAttributes;

        public CustomHtmlMapper(Set<String> set, Set<String> set2) {
            this._validTags = set;
            this._validAttributes = set2;
        }

        @Override // org.apache.tika.parser.html.DefaultHtmlMapper, org.apache.tika.parser.html.HtmlMapper
        public String mapSafeElement(String str) {
            return this._validTags.contains(str.toLowerCase()) ? str.toLowerCase() : super.mapSafeElement(str);
        }

        @Override // org.apache.tika.parser.html.DefaultHtmlMapper, org.apache.tika.parser.html.HtmlMapper
        public String mapSafeAttribute(String str, String str2) {
            return this._validAttributes.contains(str2) ? str2 : super.mapSafeAttribute(str, str2);
        }
    }

    public TikaCallable(Parser parser, BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, InputStream inputStream, Metadata metadata) {
        this(parser, baseContentExtractor, baseLinkExtractor, inputStream, metadata, true);
    }

    public TikaCallable(Parser parser, BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, InputStream inputStream, Metadata metadata, boolean z) {
        this(parser, baseContentExtractor, baseLinkExtractor, inputStream, metadata, true, null);
    }

    public TikaCallable(Parser parser, BaseContentExtractor baseContentExtractor, BaseLinkExtractor baseLinkExtractor, InputStream inputStream, Metadata metadata, boolean z, ParseContext parseContext) {
        this._parser = parser;
        this._contentExtractor = baseContentExtractor;
        this._linkExtractor = baseLinkExtractor;
        this._input = inputStream;
        this._metadata = metadata;
        this._extractLanguage = z;
        this._parseContext = parseContext;
    }

    /* JADX WARN: Can't rename method to resolve collision */
    @Override // java.util.concurrent.Callable
    public ParsedDatum call() throws Exception {
        TeeContentHandler teeContentHandler;
        try {
            ProfilingHandler profilingHandler = null;
            if (this._extractLanguage) {
                profilingHandler = new ProfilingHandler();
                teeContentHandler = new TeeContentHandler(this._contentExtractor, this._linkExtractor, profilingHandler);
            } else {
                teeContentHandler = new TeeContentHandler(this._contentExtractor, this._linkExtractor);
            }
            if (this._parseContext == null) {
                this._parseContext = makeParseContext();
            }
            this._parser.parse(this._input, teeContentHandler, this._metadata, this._parseContext);
            return new ParsedDatum(this._metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY), null, this._contentExtractor.getContent(), this._extractLanguage ? detectLanguage(this._metadata, profilingHandler) : "", this._metadata.get("title"), this._linkExtractor.getLinks(), makeMap(this._metadata));
        } catch (Exception e) {
            throw e;
        } catch (NoSuchMethodError e2) {
            throw new RuntimeException("Attempting to use excluded parser");
        } catch (Throwable th) {
            throw new RuntimeException("Serious shut-down error thrown from Tika", th);
        }
    }

    private ParseContext makeParseContext() {
        ParseContext parseContext = new ParseContext();
        Set<String> linkTags = this._linkExtractor.getLinkTags();
        HtmlMapper htmlMapper = DefaultHtmlMapper.INSTANCE;
        Iterator<String> it = linkTags.iterator();
        while (true) {
            if (!it.hasNext()) {
                break;
            }
            if (htmlMapper.mapSafeElement(it.next()) == null) {
                parseContext.set(HtmlMapper.class, new CustomHtmlMapper(linkTags, this._linkExtractor.getLinkAttributeTypes()));
                break;
            }
        }
        return parseContext;
    }

    private static String detectLanguage(Metadata metadata, ProfilingHandler profilingHandler) {
        String str = null;
        String str2 = metadata.get("language");
        String str3 = metadata.get("Content-Language");
        if (str2 != null) {
            str = str2;
        } else if (str3 != null) {
            str = str3;
        }
        String firstLanguage = getFirstLanguage(str);
        if (firstLanguage == null) {
            LanguageIdentifier language = profilingHandler.getLanguage();
            if (language.isReasonablyCertain()) {
                firstLanguage = language.getLanguage();
                LOGGER.trace("Using language specified by profiling handler: " + firstLanguage);
            } else {
                firstLanguage = "";
            }
        }
        return firstLanguage;
    }

    private static Map<String, String> makeMap(Metadata metadata) {
        HashMap hashMap = new HashMap();
        for (String str : metadata.names()) {
            hashMap.put(str, metadata.get(str));
        }
        return hashMap;
    }

    private static String getFirstLanguage(String str) {
        if (str != null && str.length() > 0 && str.length() > 2) {
            Matcher matcher = LANGUAGE_CODE_PATTERN.matcher(str);
            str = matcher.matches() ? matcher.group(1) : null;
        }
        return str;
    }
}
