package gr.ilsp.fmc.parser;

import bixo.robots.BaseRobotRules;
import bixo.robots.BaseRobotsParser;
import bixo.robots.SimpleRobotRules;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URL;
import java.net.URLDecoder;
import java.util.HashMap;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.cli.HelpFormatter;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.fa.PersianAnalyzer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.hsqldb.DatabaseURL;
import org.mortbay.util.URIUtil;

/* loaded from: input_file:gr/ilsp/fmc/parser/RobotRulesParser.class */
public class RobotRulesParser extends BaseRobotsParser {
    /** Logger shared by every parser instance. */
    private static final Logger LOGGER = Logger.getLogger(RobotRulesParser.class);
    /**
     * Maps a lower-case directive spelling (canonical names plus a few common misspellings) to the
     * directive it denotes. Populated once by the static initializer and only read afterwards.
     */
    private static final Map<String, RobotDirective> DIRECTIVE_PREFIX = new HashMap<String, RobotDirective>();
    /** Matches a colon delimiter with optional blanks around it; group 1 is the directive value. */
    private static final Pattern COLON_DIRECTIVE_DELIMITER;
    /** Matches a blank-only delimiter (no colon); group 1 is the directive value. */
    private static final Pattern BLANK_DIRECTIVE_DELIMITER;
    /** Skips the remainder of a prefix-style directive name; group 1 starts at the delimiter. */
    private static final Pattern DIRECTIVE_SUFFIX_PATTERN;
    /** Detects an opening html/head/body tag, used to recognise HTML served as robots.txt. */
    private static final Pattern SIMPLE_HTML_PATTERN;
    /** Detects a "user-agent:" field anywhere in the content, ignoring case. */
    private static final Pattern USER_AGENT_PATTERN;
    /** Declared cap on per-file warnings; NOTE(review): reportWarning compares against a literal 5. */
    private static final int MAX_WARNINGS = 5;
    /** Largest crawl delay accepted by parseContent (presumably milliseconds - TODO confirm). */
    private static final long MAX_CRAWL_DELAY = 300000;
    /** Number of warnings reported while parsing the most recent content. */
    private int _numWarnings;
    /** Lazily built ordinal lookup table emitted by the compiler for switches over RobotDirective. */
    private static /* synthetic */ int[] $SWITCH_TABLE$gr$ilsp$fmc$parser$RobotRulesParser$RobotDirective;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/parser/RobotRulesParser$ParseState.class */
    /**
     * Mutable bookkeeping for a single robots.txt parse: which user-agent groups have matched so
     * far, whether rules are currently being collected, and the rule set built up to this point.
     */
    public static class ParseState {
        private boolean _matchedRealName;
        private boolean _matchedWildcard;
        private boolean _addingRules;
        private boolean _finishedAgentFields;
        private final String _url;
        private final String _targetName;
        private final SimpleRobotRules _curRules = new SimpleRobotRules();

        /**
         * @param url        location of the robots.txt being parsed (used in warnings and by the sitemap handler)
         * @param targetName name(s) of the robot the rules are being extracted for
         */
        public ParseState(String url, String targetName) {
            _url = url;
            _targetName = targetName;
        }

        /** @return the robot name(s) supplied at construction */
        public String getTargetName() {
            return _targetName;
        }

        /** @return true once a user-agent field naming this robot explicitly has been seen */
        public boolean isMatchedRealName() {
            return _matchedRealName;
        }

        public void setMatchedRealName(boolean matched) {
            _matchedRealName = matched;
        }

        /** @return true once a wildcard user-agent field has been seen */
        public boolean isMatchedWildcard() {
            return _matchedWildcard;
        }

        public void setMatchedWildcard(boolean matched) {
            _matchedWildcard = matched;
        }

        /** @return true while directives of the current group should be recorded */
        public boolean isAddingRules() {
            return _addingRules;
        }

        public void setAddingRules(boolean adding) {
            _addingRules = adding;
        }

        /** @return true once a non user-agent line has been processed for the current group */
        public boolean isFinishedAgentFields() {
            return _finishedAgentFields;
        }

        public void setFinishedAgentFields(boolean finished) {
            _finishedAgentFields = finished;
        }

        /** Discards every rule collected so far. */
        public void clearRules() {
            _curRules.clearRules();
        }

        /**
         * Records one path rule.
         *
         * @param path  path prefix the rule applies to
         * @param allow true for an allow rule, false for a disallow rule
         */
        public void addRule(String path, boolean allow) {
            _curRules.addRule(path, allow);
        }

        /** Stores the crawl delay on the rule set being built. */
        public void setCrawlDelay(long delay) {
            _curRules.setCrawlDelay(delay);
        }

        /** @return the rule set built so far (the live instance, not a copy) */
        public SimpleRobotRules getRobotRules() {
            return _curRules;
        }

        /** @return the robots.txt location supplied at construction */
        public String getUrl() {
            return _url;
        }

        /** Registers a sitemap location on the rule set being built. */
        public void addSitemap(String sitemapUrl) {
            _curRules.addSitemap(sitemapUrl);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/parser/RobotRulesParser$RobotDirective.class */
    /**
     * Directives the parser can recognise on a robots.txt line, plus two special markers
     * ({@code UNKNOWN}, {@code MISSING}) for lines that do not map to a known directive.
     */
    public enum RobotDirective {
        USER_AGENT,
        DISALLOW,
        ALLOW,
        CRAWL_DELAY,
        SITEMAP,
        HOST,
        NO_INDEX,
        ACAP_(true, false),
        REQUEST_RATE,
        VISIT_TIME,
        ROBOT_VERSION,
        COMMENT,
        HTTP,
        UNKNOWN(false, true),
        MISSING(false, true);

        /** True when the constant names only the leading part of a family of directive names. */
        private final boolean _prefix;
        /** True for marker constants that are never registered as a recognisable spelling. */
        private final boolean _special;

        /** Ordinary directive: neither a prefix nor a special marker. */
        RobotDirective() {
            this(false, false);
        }

        RobotDirective(boolean prefix, boolean special) {
            _prefix = prefix;
            _special = special;
        }

        /** @return true for the marker constants */
        public boolean isSpecial() {
            return _special;
        }

        /** @return true when further name characters may follow the registered spelling */
        public boolean isPrefix() {
            return _prefix;
        }

        /* renamed from: values, reason: to resolve conflict with enum method */
        /** @return a fresh array holding every constant in declaration order */
        public static RobotDirective[] valuesCustom() {
            // values() already hands out a new array on each call, so callers may modify the result.
            return values();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/parser/RobotRulesParser$RobotToken.class */
    /** Immutable pairing of a recognised directive with the value text that followed it. */
    public static class RobotToken {
        private final RobotDirective _directive;
        private final String _data;

        /**
         * @param directive directive recognised on the line
         * @param data      value text associated with the directive
         */
        public RobotToken(RobotDirective directive, String data) {
            _directive = directive;
            _data = data;
        }

        /** @return the directive recognised on the line */
        public RobotDirective getDirective() {
            return _directive;
        }

        /** @return the value text associated with the directive */
        public String getData() {
            return _data;
        }
    }

    static {
        // Register the canonical spelling of every non-special directive: the constant name in
        // lower case with each underscore swapped for HelpFormatter.DEFAULT_OPT_PREFIX
        // (presumably "-", a constant the decompiler substituted - TODO confirm).
        for (RobotDirective directive : RobotDirective.valuesCustom()) {
            if (directive.isSpecial()) {
                continue;
            }
            String canonicalName = directive.name().toLowerCase().replaceAll("_", HelpFormatter.DEFAULT_OPT_PREFIX);
            DIRECTIVE_PREFIX.put(canonicalName, directive);
        }

        // Misspellings that are accepted as well.
        DIRECTIVE_PREFIX.put("useragent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("useg-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("ser-agent", RobotDirective.USER_AGENT);
        DIRECTIVE_PREFIX.put("desallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dissalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dssalow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("dsallow", RobotDirective.DISALLOW);
        DIRECTIVE_PREFIX.put("crawl delay", RobotDirective.CRAWL_DELAY);

        // Line-level patterns used by tokenize() and parseContent().
        COLON_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]*:[ \t]*(.*)");
        BLANK_DIRECTIVE_DELIMITER = Pattern.compile("[ \t]+(.*)");
        DIRECTIVE_SUFFIX_PATTERN = Pattern.compile("[^: \t]+(.*)");
        SIMPLE_HTML_PATTERN = Pattern.compile("(?is)<(html|head|body)\\s*>");
        USER_AGENT_PATTERN = Pattern.compile("(?i)user-agent:");
    }

    /**
     * Splits one normalised robots.txt line into a directive and its value.
     *
     * <p>Every registered spelling the line starts with is tried in turn. For a prefix-style
     * directive the rest of the directive name is skipped first. The spelling is accepted when what
     * remains is a colon delimiter or, failing that, a blank delimiter followed by the value.
     *
     * @param str the line to analyse; the caller passes it trimmed and lower-cased
     * @return a token for the recognised directive with its trimmed value; otherwise an
     *         {@code UNKNOWN} token when the entire line matches the colon-delimiter pattern, or a
     *         {@code MISSING} token in every other case (both carry the whole line as data)
     */
    private static RobotToken tokenize(String str) {
        for (Map.Entry<String, RobotDirective> candidate : DIRECTIVE_PREFIX.entrySet()) {
            String keyword = candidate.getKey();
            if (!str.startsWith(keyword)) {
                continue;
            }

            RobotDirective directive = candidate.getValue();
            String remainder = str.substring(keyword.length());
            if (directive.isPrefix()) {
                Matcher suffix = DIRECTIVE_SUFFIX_PATTERN.matcher(remainder);
                if (!suffix.matches()) {
                    continue;
                }
                remainder = suffix.group(1);
            }

            Matcher delimiter = COLON_DIRECTIVE_DELIMITER.matcher(remainder);
            if (!delimiter.matches()) {
                delimiter = BLANK_DIRECTIVE_DELIMITER.matcher(remainder);
                if (!delimiter.matches()) {
                    continue;
                }
            }
            return new RobotToken(directive, delimiter.group(1).trim());
        }

        if (COLON_DIRECTIVE_DELIMITER.matcher(str).matches()) {
            return new RobotToken(RobotDirective.UNKNOWN, str);
        }
        return new RobotToken(RobotDirective.MISSING, str);
    }

    /**
     * Chooses the rules to apply when robots.txt could not be fetched.
     *
     * @param i HTTP status code of the failed fetch
     * @return allow-all rules for a 4xx status; for any other non-2xx status, allow-none rules
     *         with visits deferred
     * @throws IllegalStateException if {@code i} is a 2xx status, which is not a failure
     */
    @Override // bixo.robots.BaseRobotsParser
    public BaseRobotRules failedFetch(int i) {
        boolean succeeded = i >= 200 && i < 300;
        if (succeeded) {
            throw new IllegalStateException("Can't use status code constructor with 2xx response");
        }

        boolean clientError = i >= 400 && i < 500;
        if (clientError) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }

        // Redirects, server errors and anything unexpected: block everything and defer visits.
        SimpleRobotRules deferred = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
        deferred.setDeferVisits(true);
        return deferred;
    }

    /**
     * Parses the raw bytes of a robots.txt response into the rules that apply to one robot.
     *
     * <p>The content is decoded as UTF-8 when it starts with a UTF-8 byte-order mark and as
     * US-ASCII otherwise. Content that looks like HTML but contains no "user-agent:" field is
     * treated as "no robots.txt". If it does contain one, markup is stripped from each line before
     * it is interpreted. Lines are lower-cased, comments removed, and each remaining line is
     * dispatched to the handler for its directive.
     *
     * @param str  URL the content was fetched from; used in log messages and to resolve sitemaps
     * @param bArr raw response body; may be null or empty
     * @param str2 content type reported for the response; may be null
     * @param str3 name(s) of the robot to extract rules for; must not be null
     * @return allow-all rules for empty or non-robots HTML content; allow-none rules when the
     *         parsed crawl delay exceeds {@code MAX_CRAWL_DELAY}; the parsed rules otherwise
     */
    @Override // bixo.robots.BaseRobotsParser
    public BaseRobotRules parseContent(String str, byte[] bArr, String str2, String str3) {
        this._numWarnings = 0;
        if (bArr == null || bArr.length == 0) {
            return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
        }

        // Skip a UTF-8 byte-order mark (EF BB BF) and let it select the decoding.
        int length = bArr.length;
        int offset = 0;
        String encoding = "us-ascii";
        if (length >= 3 && bArr[0] == (byte) 0xEF && bArr[1] == (byte) 0xBB && bArr[2] == (byte) 0xBF) {
            offset = 3;
            length -= 3;
            encoding = "UTF-8";
        }

        String content;
        try {
            content = new String(bArr, offset, length, encoding);
        } catch (UnsupportedEncodingException e) {
            // Both charsets are mandatory on every JVM; keep the cause in case that ever changes.
            throw new RuntimeException("Impossible unsupported encoding exception for " + encoding, e);
        }

        boolean htmlContentType = str2 != null && str2.toLowerCase().startsWith("text/html");
        boolean stripMarkup = false;
        if (htmlContentType || SIMPLE_HTML_PATTERN.matcher(content).find()) {
            if (!USER_AGENT_PATTERN.matcher(content).find()) {
                LOGGER.trace("Found non-robots.txt HTML file: " + str);
                return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);
            }
            if (htmlContentType) {
                LOGGER.debug("HTML content type returned for robots.txt file: " + str);
            } else {
                LOGGER.debug("Found HTML in robots.txt file: " + str);
            }
            stripMarkup = true;
        }

        StringTokenizer lines = new StringTokenizer(content, "\n\r\u0085\u2028\u2029");
        ParseState parseState = new ParseState(str, str3.toLowerCase());
        // A handler returns false to stop parsing early (handleUserAgent does so once the group
        // that names this robot explicitly is complete).
        boolean keepGoing = true;
        while (keepGoing && lines.hasMoreTokens()) {
            String line = lines.nextToken();
            if (stripMarkup) {
                line = line.replaceAll("<[^>]+>", "");
            }
            // Drop a trailing comment. The marker constant was substituted by the decompiler;
            // presumably it is "#" - TODO confirm.
            int commentStart = line.indexOf(PersianAnalyzer.STOPWORDS_COMMENT);
            if (commentStart >= 0) {
                line = line.substring(0, commentStart);
            }
            String lowerCase = line.trim().toLowerCase();
            if (lowerCase.length() == 0) {
                continue;
            }

            RobotToken robotToken = tokenize(lowerCase);
            switch (robotToken.getDirective()) {
                case USER_AGENT:
                    keepGoing = handleUserAgent(parseState, robotToken);
                    break;
                case DISALLOW:
                    keepGoing = handleDisallow(parseState, robotToken);
                    break;
                case ALLOW:
                    keepGoing = handleAllow(parseState, robotToken);
                    break;
                case CRAWL_DELAY:
                    keepGoing = handleCrawlDelay(parseState, robotToken);
                    break;
                case SITEMAP:
                    keepGoing = handleSitemap(parseState, robotToken);
                    break;
                case HTTP:
                    keepGoing = handleHttp(parseState, robotToken);
                    break;
                case UNKNOWN:
                    reportWarning("Unknown directive in robots.txt file: " + lowerCase, str);
                    parseState.setFinishedAgentFields(true);
                    break;
                case MISSING:
                    reportWarning(String.format("Unknown line in robots.txt file (size %d): %s", Integer.valueOf(bArr.length), lowerCase), str);
                    parseState.setFinishedAgentFields(true);
                    break;
                default:
                    // Recognised directives without a handler are ignored.
                    break;
            }
        }

        SimpleRobotRules robotRules = parseState.getRobotRules();
        if (robotRules.getCrawlDelay() <= MAX_CRAWL_DELAY) {
            return robotRules;
        }
        LOGGER.debug("Crawl delay exceeds max value - so disallowing all URLs: " + str);
        return new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);
    }

    /**
     * Counts a parsing problem and logs it, keeping the log output bounded per file.
     *
     * <p>The first problem also logs a header naming the file. Only the first four problem
     * messages are written; later ones are counted but not logged.
     *
     * @param str  description of the problem
     * @param str2 URL of the robots.txt being processed
     */
    private void reportWarning(String str, String str2) {
        int warningCount = ++this._numWarnings;
        if (warningCount == 1) {
            LOGGER.warn("Problem processing robots.txt for " + str2);
        }
        if (warningCount < 5) {
            LOGGER.warn("\t" + str);
        }
    }

    /**
     * Handles a user-agent field, deciding whether the group it opens applies to this robot.
     *
     * <p>Once a group naming this robot explicitly has matched, any further user-agent field
     * either belongs to that same group's header (keep parsing) or starts a new group (stop
     * parsing). Otherwise each agent name on the line is compared with the target names: a
     * wildcard starts collecting rules the first time it is seen, and an explicit match discards
     * rules collected so far and starts over.
     *
     * @param parseState state of the current parse
     * @param robotToken the user-agent token; its data holds agent names separated by blanks or commas
     * @return false when parsing should stop because the explicitly matched group is complete,
     *         true otherwise
     */
    private boolean handleUserAgent(ParseState parseState, RobotToken robotToken) {
        if (parseState.isMatchedRealName()) {
            return !parseState.isFinishedAgentFields();
        }
        if (parseState.isFinishedAgentFields()) {
            // A new group begins: stop collecting until one of its agent names matches.
            parseState.setFinishedAgentFields(false);
            parseState.setAddingRules(false);
        }

        // Separator constant was substituted by the decompiler; presumably a single space - TODO confirm.
        String[] targetNames = parseState.getTargetName().split(ShingleFilter.TOKEN_SEPARATOR);
        for (String agentName : robotToken.getData().split("[ \t,]")) {
            if (agentName.length() == 0) {
                // Produced by adjacent separators ("a, b") or an empty value. Every string starts
                // with "", so letting it through would match any robot.
                continue;
            }
            if (agentName.equals("*") && !parseState.isMatchedWildcard()) {
                parseState.setMatchedWildcard(true);
                parseState.setAddingRules(true);
                continue;
            }
            // Bounded scan: the previous open-ended loop never exited when no target name matched.
            for (String targetName : targetNames) {
                if (targetName.startsWith(agentName)) {
                    parseState.setMatchedRealName(true);
                    parseState.setAddingRules(true);
                    // An explicit match overrides anything gathered from a wildcard group.
                    parseState.clearRules();
                    break;
                }
            }
        }
        return true;
    }

    /**
     * Handles a disallow directive for the group currently being collected.
     *
     * <p>The path is URL-decoded as UTF-8. An empty path clears all rules collected so far; any
     * other path is recorded as a disallow rule. A failure is reported as a warning and the
     * directive is dropped.
     *
     * @param parseState state of the current parse
     * @param robotToken the disallow token; its data is the path
     * @return always true (parsing continues)
     */
    private boolean handleDisallow(ParseState parseState, RobotToken robotToken) {
        parseState.setFinishedAgentFields(true);
        if (parseState.isAddingRules()) {
            String path = robotToken.getData();
            try {
                path = URLDecoder.decode(path, "UTF-8");
                if (path.length() != 0) {
                    parseState.addRule(path, false);
                } else {
                    parseState.clearRules();
                }
            } catch (Exception e) {
                reportWarning("Error parsing robots rules - can't decode path: " + path, parseState.getUrl());
            }
        }
        return true;
    }

    /**
     * Handles an allow directive for the group currently being collected.
     *
     * <p>The path is URL-decoded as UTF-8; if decoding fails a warning is reported and the raw
     * path is used instead. An empty path clears all rules collected so far; any other path is
     * recorded as an allow rule.
     *
     * @param parseState state of the current parse
     * @param robotToken the allow token; its data is the path
     * @return always true (parsing continues)
     */
    private boolean handleAllow(ParseState parseState, RobotToken robotToken) {
        parseState.setFinishedAgentFields(true);
        if (parseState.isAddingRules()) {
            String path = robotToken.getData();
            try {
                path = URLDecoder.decode(path, "UTF-8");
            } catch (Exception e) {
                reportWarning("Error parsing robots rules - can't decode path: " + path, parseState.getUrl());
            }
            if (path.length() != 0) {
                parseState.addRule(path, true);
            } else {
                parseState.clearRules();
            }
        }
        return true;
    }

    /**
     * Handles a crawl-delay directive for the group currently being collected.
     *
     * <p>The value is multiplied by 1000 before being stored (presumably seconds to
     * milliseconds - TODO confirm). Values containing a decimal point are parsed as a double and
     * rounded; all others are parsed as an int. An empty value is ignored and an unparsable one
     * is reported as a warning.
     *
     * @param parseState state of the current parse
     * @param robotToken the crawl-delay token; its data is the numeric value
     * @return always true (parsing continues)
     */
    private boolean handleCrawlDelay(ParseState parseState, RobotToken robotToken) {
        parseState.setFinishedAgentFields(true);
        if (!parseState.isAddingRules()) {
            return true;
        }
        String data = robotToken.getData();
        if (data.length() <= 0) {
            return true;
        }
        try {
            if (data.indexOf('.') != -1) {
                parseState.setCrawlDelay(Math.round(Double.parseDouble(data) * 1000.0d));
            } else {
                // Multiply in long arithmetic: an int product overflows for values above
                // 2147483, which could wrap a huge delay into a small or negative one.
                parseState.setCrawlDelay(Integer.parseInt(data) * 1000L);
            }
            return true;
        } catch (Exception e) {
            reportWarning("Error parsing robots rules - can't decode crawl delay: " + data, parseState.getUrl());
            return true;
        }
    }

    /**
     * Handles a sitemap directive, recording the sitemap location when it is a usable URL.
     *
     * <p>A value that does not start with an http or https scheme is resolved against the URL of
     * the robots.txt file, so relative references keep that URL's scheme, host and port and are
     * joined with a proper path separator. The result is recorded only if both {@link URL} and
     * {@link URI} report a non-empty host for it. Empty values are ignored; values that cannot be
     * parsed are reported as a warning.
     *
     * @param parseState state of the current parse
     * @param robotToken the sitemap token; its data is the sitemap location
     * @return always true (parsing continues)
     */
    private boolean handleSitemap(ParseState parseState, RobotToken robotToken) {
        String sitemap = robotToken.getData();
        if (sitemap == null || sitemap.length() == 0) {
            // Nothing to record; resolving "" would just yield the robots.txt URL itself.
            return true;
        }
        try {
            if (!sitemap.startsWith("http://") && !sitemap.startsWith("https://")) {
                sitemap = new URL(new URL(parseState.getUrl()), sitemap).toExternalForm();
            }

            String urlHost = new URL(sitemap).getHost();
            if (urlHost == null || urlHost.length() == 0) {
                return true;
            }
            // URI parsing is stricter than URL parsing, so check the host with it as well.
            String uriHost = new URI(sitemap).getHost();
            if (uriHost == null || uriHost.length() == 0) {
                return true;
            }
            parseState.addSitemap(sitemap);
        } catch (Exception e) {
            reportWarning("Invalid URL with sitemap directive: " + sitemap, parseState.getUrl());
        }
        return true;
    }

    /**
     * Handles a line that tokenized as the HTTP directive, i.e. one that began with "http".
     *
     * <p>If the value mentions "sitemap", the line is treated as a sitemap directive whose value
     * is the URIUtil.HTTP_COLON constant followed by the token data. Anything else is only
     * reported as a warning.
     *
     * @param parseState state of the current parse
     * @param robotToken the HTTP token; its data is the text that followed the directive's delimiter
     * @return the sitemap handler's result, or true when the line was only reported
     */
    private boolean handleHttp(ParseState parseState, RobotToken robotToken) {
        String remainder = robotToken.getData();
        if (!remainder.contains("sitemap")) {
            reportWarning("Found raw non-sitemap URL: http:" + remainder, parseState.getUrl());
            return true;
        }
        RobotToken sitemapToken = new RobotToken(RobotDirective.SITEMAP, URIUtil.HTTP_COLON + remainder);
        return handleSitemap(parseState, sitemapToken);
    }

    /**
     * Returns how many warnings have been reported since the counter was last reset;
     * {@code parseContent} resets it to zero at the start of every call.
     *
     * @return the current warning count
     */
    public int getNumWarnings() {
        return this._numWarnings;
    }

    /**
     * Compiler-generated helper that lazily builds and caches the lookup table used when
     * switching over {@code RobotDirective}.
     *
     * <p>The table is indexed by a constant's {@code ordinal()} and holds that constant's case
     * label, which in every entry below is its declaration position counted from 1. Each
     * assignment is wrapped in its own try/catch so that a constant missing at run time
     * ({@code NoSuchFieldError}) leaves its slot at 0 instead of failing the whole table.
     *
     * @return the cached table, built on first use
     */
    static /* synthetic */ int[] $SWITCH_TABLE$gr$ilsp$fmc$parser$RobotRulesParser$RobotDirective() {
        // Reuse the table if an earlier call already built it.
        int[] iArr = $SWITCH_TABLE$gr$ilsp$fmc$parser$RobotRulesParser$RobotDirective;
        if (iArr != null) {
            return iArr;
        }
        int[] iArr2 = new int[RobotDirective.valuesCustom().length];
        try {
            iArr2[RobotDirective.ACAP_.ordinal()] = 8;
        } catch (NoSuchFieldError unused) {
        }
        try {
            iArr2[RobotDirective.ALLOW.ordinal()] = 3;
        } catch (NoSuchFieldError unused2) {
        }
        try {
            iArr2[RobotDirective.COMMENT.ordinal()] = 12;
        } catch (NoSuchFieldError unused3) {
        }
        try {
            iArr2[RobotDirective.CRAWL_DELAY.ordinal()] = 4;
        } catch (NoSuchFieldError unused4) {
        }
        try {
            iArr2[RobotDirective.DISALLOW.ordinal()] = 2;
        } catch (NoSuchFieldError unused5) {
        }
        try {
            iArr2[RobotDirective.HOST.ordinal()] = 6;
        } catch (NoSuchFieldError unused6) {
        }
        try {
            iArr2[RobotDirective.HTTP.ordinal()] = 13;
        } catch (NoSuchFieldError unused7) {
        }
        try {
            iArr2[RobotDirective.MISSING.ordinal()] = 15;
        } catch (NoSuchFieldError unused8) {
        }
        try {
            iArr2[RobotDirective.NO_INDEX.ordinal()] = 7;
        } catch (NoSuchFieldError unused9) {
        }
        try {
            iArr2[RobotDirective.REQUEST_RATE.ordinal()] = 9;
        } catch (NoSuchFieldError unused10) {
        }
        try {
            iArr2[RobotDirective.ROBOT_VERSION.ordinal()] = 11;
        } catch (NoSuchFieldError unused11) {
        }
        try {
            iArr2[RobotDirective.SITEMAP.ordinal()] = 5;
        } catch (NoSuchFieldError unused12) {
        }
        try {
            iArr2[RobotDirective.UNKNOWN.ordinal()] = 14;
        } catch (NoSuchFieldError unused13) {
        }
        try {
            iArr2[RobotDirective.USER_AGENT.ordinal()] = 1;
        } catch (NoSuchFieldError unused14) {
        }
        try {
            iArr2[RobotDirective.VISIT_TIME.ordinal()] = 10;
        } catch (NoSuchFieldError unused15) {
        }
        // Publish the finished table for later calls.
        $SWITCH_TABLE$gr$ilsp$fmc$parser$RobotRulesParser$RobotDirective = iArr2;
        return iArr2;
    }
}
