package gr.ilsp.fmc.classifier;

import com.cybozu.labs.langdetect.Detector;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
import gr.ilsp.fmc.datums.ClassifierDatum;
import gr.ilsp.fmc.datums.ExtendedParsedDatum;
import gr.ilsp.fmc.utils.TopicTools;
import java.io.IOException;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.Map;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.util.StringUtils;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.tika.metadata.DublinCore;

/* loaded from: input_file:gr/ilsp/fmc/classifier/Classifier.class */
public class Classifier implements Serializable {
    /** Logger shared by all instances of this class. */
    private static final Logger LOGGER = Logger.getLogger(Classifier.class);
    /** Multiplier that classify() passes to rankText for topic-term hits in the page title. */
    private double TITLE_WEIGHT = 10.0d;
    /** Multiplier that classify() passes to rankText for hits in the "keywords" metadata value. */
    private double KEYWORDS_WEIGHT = 4.0d;
    /** Multiplier that classify() passes to rankText for hits in the description metadata value. */
    private double META_WEIGHT = 2.0d;
    /** Multiplier that classify() passes to rankText for hits in the page body text. */
    private double CONTENT_WEIGHT = 1.0d;
    /** Accepted language codes, separated by ';'. classify() skips the language check when this is null. */
    private String _targetLanguage;
    /** classify() rejects (returns null for) documents whose text has fewer whitespace-separated tokens. */
    private int _minTokensNumber;
    /** Class names handed to rankText/classifyText; rankText looks names up with Arrays.binarySearch. */
    private String[] _classes;
    /** One comma-separated key list per index; containLangKeys compares the keys against anchor-text words. */
    private String[] _targetlangKeys;
    /**
     * Topic definition. Each entry is read as: [0] weight (parsed as double), [1] term (used inside a regex),
     * [2] class name or ';'-separated class names, [3] language code. May be null (classify() handles that case).
     */
    private ArrayList<String[]> _topic;
    /** Absolute score threshold used by classify() and by the link-ranking methods. */
    private double _absthres;
    /** Relative score threshold. Note that classify() overwrites it with -0.1 when _absthres is 0. */
    private double _relthres;
    /** Minimum number of distinct positively-weighted terms for content; static, so the last constructed instance wins. */
    private static int _min_uniq_terms;
    /** When true, classify() passes the lower-cased text through cleanContent before scoring. */
    private boolean _keepBoiler;
    /** Value returned by getMaxDepth(); not otherwise used in this class. */
    private int _max_depth;
    /** Language code detected by the most recent classify() call; static and read by the static rankText. */
    private static String langIdentified;

    /**
     * Creates a classifier from its configuration values.
     *
     * @param strArr    per-index comma-separated language key lists (stored as {@code _targetlangKeys})
     * @param str       accepted language codes separated by ';', or null
     * @param strArr2   class names used when scoring
     * @param arrayList topic definition entries, or null
     * @param d         absolute score threshold
     * @param d2        relative score threshold
     * @param z         whether parsed text is passed through {@code cleanContent} before scoring
     * @param i         minimum number of unique topic terms; stored in a static field shared by all instances
     * @param i2        maximum depth, exposed through {@code getMaxDepth()}
     * @param i3        minimum number of tokens a document must contain
     */
    public Classifier(String[] strArr, String str, String[] strArr2, ArrayList<String[]> arrayList, double d, double d2, boolean z, int i, int i2, int i3) {
        // Language configuration.
        this._targetlangKeys = strArr;
        this._targetLanguage = str;

        // Topic configuration.
        this._classes = strArr2;
        this._topic = arrayList;

        // Thresholds and limits.
        this._absthres = d;
        this._relthres = d2;
        this._minTokensNumber = i3;
        this._max_depth = i2;
        this._keepBoiler = z;

        // Static: this value is shared by every Classifier instance.
        Classifier._min_uniq_terms = i;
    }

    /**
     * Classifies a parsed document against the configured languages and topic definition.
     *
     * <p>Returns null when the document is rejected: too few tokens, language detection fails or yields a
     * language outside the target set, any section cannot be scored, or the scores fall below the thresholds.
     * When no topic definition is configured, an empty classification carrying only the URL and the token
     * count is returned.
     *
     * <p>Side effects: stores the detected language in the static {@code langIdentified} (read by
     * {@code rankText}), and sets {@code _relthres} to -0.1 when {@code _absthres} is 0.
     *
     * @param extendedParsedDatum the parsed document (title, metadata, text and URL are read from it)
     * @return the classification, or null when the document is rejected
     */
    public ClassifierDatum classify(ExtendedParsedDatum extendedParsedDatum) {
        String title = extendedParsedDatum.getTitle();
        Map<String, String> parsedMeta = extendedParsedDatum.getParsedMeta();
        String keywords = "";
        String description = "";
        String content = extendedParsedDatum.getParsedText().toLowerCase();
        if (this._keepBoiler) {
            content = cleanContent(content);
        }
        int tokenCount = new StringTokenizer(content).countTokens();
        if (tokenCount < this._minTokensNumber) {
            return null;
        }
        String url = extendedParsedDatum.getUrl();
        for (Map.Entry<String, String> entry : parsedMeta.entrySet()) {
            if (entry.getKey().equals("keywords")) {
                keywords = entry.getValue();
            }
            if (entry.getKey().equals(DublinCore.DESCRIPTION)) {
                description = entry.getValue();
            }
        }

        if (this._targetLanguage != null) {
            try {
                Detector detector = DetectorFactory.create();
                detector.append(content);
                langIdentified = detector.detect();
            } catch (LangDetectException e) {
                // Without a detected language the page cannot be confirmed as a target-language page.
                // Previously the exception was swallowed and the stale (or null) static value was used.
                LOGGER.warn("Language detection failed for " + url + ": " + e.getMessage());
                return null;
            }
            boolean inTargetLanguage = false;
            for (String lang : this._targetLanguage.split(";")) {
                // Receiver is the configured code, so a null detection result cannot throw.
                if (lang.equals(langIdentified)) {
                    inTargetLanguage = true;
                    break;
                }
            }
            if (!inTargetLanguage) {
                return null;
            }
        }

        if (this._topic == null) {
            return new ClassifierDatum(url, new String[0], new Double[0][0], Double.valueOf(0.0d), Double.valueOf(0.0d), tokenCount);
        }
        if (title == null) {
            title = "";
        }

        Double[][] titleRank = rankText(title, this.TITLE_WEIGHT, this._topic, this._classes, false);
        Double[][] keywordsRank = rankText(keywords, this.KEYWORDS_WEIGHT, this._topic, this._classes, false);
        Double[][] descriptionRank = rankText(description, this.META_WEIGHT, this._topic, this._classes, false);
        Double[][] contentRank = rankText(content, this.CONTENT_WEIGHT, this._topic, this._classes, true);
        // rankText returns null when analysis fails (any section) or when the content has too few unique terms.
        if (titleRank == null || keywordsRank == null || descriptionRank == null || contentRank == null) {
            return null;
        }

        // The relative threshold is intentionally passed twice (total-relative and per-class-share thresholds).
        ClassifierDatum result = classifyText(titleRank, keywordsRank, descriptionRank, contentRank,
                this._absthres, this._relthres, this._relthres, this._classes, url, tokenCount);
        if (result == null) {
            return null;
        }

        // Last row of the content rank holds the content totals (absolute, relative).
        Double[] contentTotals = contentRank[contentRank.length - 1];
        double contentAbs = contentTotals[0].doubleValue();
        double contentRel = contentTotals[1].doubleValue();
        if (this._absthres == 0.0d) {
            // With no absolute threshold, relax the relative one so that a relative score of 0 still passes.
            this._relthres = -0.1d;
        }
        if (contentAbs < this._absthres || contentRel <= this._relthres) {
            return null;
        }
        return result;
    }

    /**
     * Combines the per-section score tables and selects the classes that pass the thresholds.
     *
     * <p>Each table has one row per class plus a final totals row; column 0 is the absolute score and
     * column 1 the relative score. The title table {@code dArr} is updated in place with the sum of all
     * four tables.
     *
     * @param dArr   title scores; overwritten with the combined scores
     * @param dArr2  keywords scores
     * @param dArr3  description scores
     * @param dArr4  content scores
     * @param d      minimum combined total absolute score
     * @param d2     minimum combined total relative score
     * @param d3     minimum share of the combined total relative score a class needs to be selected
     * @param strArr class names, index-aligned with the table rows
     * @param str    URL passed through to the result
     * @param i      token count passed through to the result
     * @return the classification, or null when a combined total is below its threshold
     */
    public static ClassifierDatum classifyText(Double[][] dArr, Double[][] dArr2, Double[][] dArr3, Double[][] dArr4, double d, double d2, double d3, String[] strArr, String str, int i) {
        // Accumulate all sections into the title table, totals row included.
        for (int row = 0; row < dArr.length; row++) {
            Double[] combined = dArr[row];
            combined[0] = Double.valueOf(combined[0].doubleValue() + dArr2[row][0].doubleValue() + dArr3[row][0].doubleValue() + dArr4[row][0].doubleValue());
            combined[1] = Double.valueOf(combined[1].doubleValue() + dArr2[row][1].doubleValue() + dArr3[row][1].doubleValue() + dArr4[row][1].doubleValue());
        }

        int totalsRow = dArr.length - 1;
        double totalAbs = dArr[totalsRow][0].doubleValue();
        double totalRel = dArr[totalsRow][1].doubleValue();
        if (totalAbs < d || totalRel < d2) {
            return null;
        }

        ArrayList<String> selectedClasses = new ArrayList<String>();
        ArrayList<Double[]> selectedScores = new ArrayList<Double[]>();
        double selectedAbsSum = 0.0d;
        for (int cls = 0; cls < totalsRow; cls++) {
            // A zero total gives NaN or infinity here; NaN fails the comparison, so the class is skipped.
            if (dArr[cls][1].doubleValue() / totalRel >= d3) {
                selectedClasses.add(strArr[cls]);
                selectedAbsSum += dArr[cls][0].doubleValue();
                selectedScores.add(new Double[]{dArr[cls][0], dArr[cls][1]});
            }
        }

        String[] classNames = selectedClasses.toArray(new String[selectedClasses.size()]);
        Double[][] classScores = selectedScores.toArray(new Double[selectedScores.size()][]);

        // Content total relative score, halved and rounded to three decimals.
        double contentRel = dArr4[dArr4.length - 1][1].doubleValue();
        double roundedRel = Math.round((contentRel / 2.0d) * 1000.0d) / 1000.0d;
        return new ClassifierDatum(str, classNames, classScores, Double.valueOf(selectedAbsSum), Double.valueOf(roundedRel), i);
    }

    /**
     * Scores a piece of text against the topic definition.
     *
     * <p>The text is analyzed with {@code TopicTools.analyze} using the language stored in the static
     * {@code langIdentified}; only topic entries whose language equals that value are considered. For every
     * term that occurs, its weight times its number of occurrences is added to the class (or split evenly
     * between the ';'-separated classes) named by the entry.
     *
     * <p>Class names are located with {@link Arrays#binarySearch(Object[], Object)}, so {@code strArr} is
     * assumed to be sorted (TODO confirm at the call sites). A class name that is not found is logged and
     * skipped instead of indexing the table with a negative position.
     *
     * <p>NOTE(review): the term pattern consumes its trailing separator, so two directly adjacent
     * occurrences of the same term are counted once. Kept as-is to avoid changing scores.
     *
     * @param str       text to score
     * @param d         weight multiplied into every absolute score
     * @param arrayList topic entries: [0] weight, [1] term (regex), [2] class name(s), [3] language
     * @param strArr    class names
     * @param z         true for page content: enables the unique-term minimum and the relative scores
     * @return a table with one row per class plus a final totals row (column 0 absolute, column 1
     *         relative), or null when analysis fails or the content has too few unique terms
     */
    public static Double[][] rankText(String str, double d, ArrayList<String[]> arrayList, String[] strArr, boolean z) {
        ArrayList<String> tokens;
        try {
            tokens = TopicTools.analyze(str, langIdentified);
        } catch (IOException e) {
            LOGGER.warn(e.getMessage(), e);
            return null;
        }

        final String sep = ShingleFilter.TOKEN_SEPARATOR;
        StringBuilder joined = new StringBuilder();
        for (String token : tokens) {
            joined.append(sep).append(token);
        }
        String text = joined.toString().trim();
        double tokenCount = new StringTokenizer(text).countTokens();

        int classCount = strArr.length;
        Double[][] scores = new Double[classCount + 1][2];
        for (Double[] row : scores) {
            row[0] = Double.valueOf(0.0d);
            row[1] = Double.valueOf(0.0d);
        }
        if (tokenCount == 0.0d) {
            return scores;
        }

        // Padded once so that terms at either end of the text can match the " term " pattern.
        String padded = sep + text + sep;
        int uniqueTerms = 0;
        for (String[] entry : arrayList) {
            String term = entry[1];
            String termLanguage = entry[3];
            double weight = Double.parseDouble(entry[0]);
            if (!termLanguage.equals(langIdentified)) {
                continue;
            }
            Matcher matcher = Pattern.compile(sep + term + sep).matcher(padded);
            int hits = 0;
            while (matcher.find()) {
                hits++;
            }
            if (hits == 0) {
                continue;
            }
            if (weight > 0.0d) {
                uniqueTerms++;
            }
            double termScore = weight * hits;
            // A single class name splits into one element, so the share is the full score.
            String[] termClasses = entry[2].split(";");
            for (String className : termClasses) {
                int index = Arrays.binarySearch(strArr, className);
                if (index < 0) {
                    LOGGER.warn("Topic term '" + term + "' refers to unknown class '" + className + "'; skipping");
                    continue;
                }
                scores[index][0] = Double.valueOf(scores[index][0].doubleValue() + (termScore / termClasses.length));
            }
        }

        if (z && uniqueTerms < _min_uniq_terms && _min_uniq_terms > 0) {
            return null;
        }

        Double[] totals = scores[classCount];
        for (int cls = 0; cls < classCount; cls++) {
            Double[] row = scores[cls];
            row[0] = Double.valueOf(row[0].doubleValue() * d);
            if (z) {
                // Relative score: weighted absolute score normalized by the text length in tokens.
                row[1] = Double.valueOf(row[0].doubleValue() / tokenCount);
            }
            totals[0] = Double.valueOf(row[0].doubleValue() + totals[0].doubleValue());
            totals[1] = Double.valueOf(row[1].doubleValue() + totals[1].doubleValue());
        }
        return scores;
    }

    /**
     * Scores a link from the text around it, its anchor text, and the language and score of its page.
     *
     * <ul>
     *   <li>Empty link text scores 0.</li>
     *   <li>With more than one target language: if the anchor text contains a language key of a target
     *       language other than the page language and the page score reaches the absolute threshold, the
     *       score is 10000 times the absolute threshold.</li>
     *   <li>Otherwise, unless both the page language and the detected language of the link text are
     *       target languages, the score is 0.</li>
     *   <li>Otherwise the base score is the absolute threshold (times 10 when the link text is in a
     *       different language than the page, has more than three tokens and the page score reaches the
     *       threshold), plus weight times occurrences for every topic term found in the link text.</li>
     * </ul>
     *
     * <p>If the link text cannot be analyzed, the failure is logged and the base score is returned.
     * NOTE(review): the decompiled original only printed the stack trace here; returning the base score is
     * assumed to match the intended fall-through — confirm.
     *
     * @param str  text of/around the link
     * @param str2 anchor text, checked for language keys
     * @param str3 language code of the page containing the link
     * @param d    score of the page containing the link
     * @return the link score
     */
    public double rankLink(String str, String str2, String str3, double d) {
        String linkText = str.trim();
        String[] targetLangs = this._targetLanguage.split(";");

        boolean pageInTarget = false;
        for (String lang : targetLangs) {
            if (str3.equals(lang)) {
                pageInTarget = true;
                break;
            }
        }
        if (linkText.isEmpty()) {
            return 0.0d;
        }

        String linkLang = checkLang(linkText.toLowerCase());
        boolean linkInTarget = false;
        for (String lang : targetLangs) {
            if (linkLang.equals(lang)) {
                linkInTarget = true;
                break;
            }
        }

        if (targetLangs.length > 1) {
            boolean pointsToOtherLanguage = false;
            for (int k = 0; k < targetLangs.length; k++) {
                if (containLangKeys(str2, k) && !targetLangs[k].equals(str3)) {
                    pointsToOtherLanguage = true;
                    break;
                }
            }
            if (pointsToOtherLanguage && d >= this._absthres) {
                return 10000.0d * this._absthres;
            }
        }

        if (!pageInTarget || !linkInTarget) {
            return 0.0d;
        }

        boolean otherLanguageText = !linkLang.equals(str3)
                && new StringTokenizer(linkText).countTokens() > 3
                && d >= this._absthres;
        double score = otherLanguageText ? 10.0d * this._absthres : this._absthres;

        ArrayList<String> tokens;
        try {
            tokens = TopicTools.analyze(linkText, linkLang);
        } catch (IOException e) {
            LOGGER.warn("Could not analyze link text: " + e.getMessage(), e);
            return score;
        }

        final String sep = ShingleFilter.TOKEN_SEPARATOR;
        StringBuilder joined = new StringBuilder();
        for (String token : tokens) {
            joined.append(sep).append(token);
        }
        // Padded once so that terms at either end of the text can match the " term " pattern.
        String padded = sep + joined.toString().trim() + sep;

        for (String[] entry : this._topic) {
            // One matcher per term: creating a new matcher on every find() would restart the search and never end.
            Matcher matcher = Pattern.compile(sep + entry[1] + sep).matcher(padded);
            int hits = 0;
            while (matcher.find()) {
                hits++;
            }
            if (hits > 0) {
                score += Double.parseDouble(entry[0]) * hits;
            }
        }
        return score;
    }

    /**
     * Tells whether the given text contains one of the language keys configured at index {@code i}.
     *
     * <p>The text is lower-cased and split on the token separator; the keys at {@code _targetlangKeys[i]}
     * are split on commas and compared to the words for exact equality. Texts of more than ten words never
     * match.
     *
     * @param str text to inspect (anchor text in the callers visible in this class)
     * @param i   index into the configured language key lists
     * @return true if any key equals any word of the text; false otherwise, including when no keys are
     *         configured for {@code i}
     */
    private boolean containLangKeys(String str, int i) {
        if (this._targetlangKeys == null || i < 0 || i >= this._targetlangKeys.length) {
            return false;
        }
        String keyList = this._targetlangKeys[i];
        if (keyList == null || keyList.isEmpty()) {
            return false;
        }
        String[] keys = keyList.split(StringUtils.COMMA_STR);
        String[] words = str.toLowerCase().split(ShingleFilter.TOKEN_SEPARATOR);
        if (words.length > 10) {
            return false;
        }
        for (String key : keys) {
            // Bounded scan: the previous loop never terminated when the key was absent.
            for (String word : words) {
                if (key.equals(word)) {
                    return true;
                }
            }
        }
        return false;
    }

    /** Matches a {@code <text ...>...</text>} span; '.' does not cross line breaks, so matches stay on one line. */
    private static final Pattern TEXT_ELEMENT = Pattern.compile("<text.*>.*</text>");

    /** Matches a bare opening or closing {@code text} tag. */
    private static final Pattern TEXT_TAG = Pattern.compile("</?text>");

    /**
     * Extracts the {@code <text>} elements from marked-up content and strips their tags.
     *
     * <p>For every match, the {@code type='listelem'}, {@code type='title'} and {@code type='heading'}
     * attributes are removed and each bare {@code <text>} / {@code </text>} tag is replaced by a single
     * space (the value of Lucene's {@code ShingleFilter.TOKEN_SEPARATOR}, which the previous code used
     * here). Everything outside the matched elements is dropped.
     *
     * @param str marked-up content
     * @return the concatenated cleaned elements; empty if nothing matches
     */
    public static String cleanContent(String str) {
        StringBuilder cleaned = new StringBuilder();
        Matcher matcher = TEXT_ELEMENT.matcher(str);
        while (matcher.find()) {
            String element = matcher.group()
                    .replace(" type='listelem'", "")
                    .replace(" type='title'", "")
                    .replace(" type='heading'", "");
            cleaned.append(TEXT_TAG.matcher(element).replaceAll(" "));
        }
        return cleaned.toString();
    }

    /**
     * Returns the topic definition passed to the constructor.
     *
     * @return the internal topic list itself (not a copy); null if none was configured
     */
    public ArrayList<String[]> getTopic() {
        return this._topic;
    }

    /**
     * Returns the maximum depth value passed to the constructor.
     *
     * @return the configured maximum depth
     */
    public int getMaxDepth() {
        return this._max_depth;
    }

    /**
     * Detects the language of a text on a best-effort basis.
     *
     * @param str text to inspect
     * @return the detected language code, or an empty string when the text is shorter than five
     *         characters or detection fails
     */
    public static String checkLang(String str) {
        if (str.length() < 5) {
            return "";
        }
        try {
            Detector detector = DetectorFactory.create();
            detector.append(str);
            return detector.detect();
        } catch (LangDetectException ignored) {
            // Best effort: an undetectable text is reported as "no language".
            return "";
        }
    }

    /**
     * Scores a link when no topic definition is used.
     *
     * <p>Result: 0 for empty link text; 2 when there are several target languages, the anchor text holds a
     * language key of a target language other than the page language and the page score reaches the
     * absolute threshold; otherwise 1 when both the page language and the detected language of the link
     * text are target languages, else 0.
     *
     * @param str  text of/around the link
     * @param str2 anchor text, checked for language keys
     * @param str3 language code of the page containing the link
     * @param d    score of the page containing the link
     * @return 0, 1 or 2 as described above
     */
    public double rankLinkNotopic(String str, String str2, String str3, double d) {
        String linkText = str.trim();
        String[] targetLangs = this._targetLanguage.split(";");

        boolean pageInTarget = false;
        for (String lang : targetLangs) {
            if (str3.equals(lang)) {
                pageInTarget = true;
                break;
            }
        }
        if (linkText.isEmpty()) {
            return 0.0d;
        }

        String linkLang = checkLang(linkText.toLowerCase());
        boolean linkInTarget = false;
        for (String lang : targetLangs) {
            if (linkLang.equals(lang)) {
                linkInTarget = true;
                break;
            }
        }

        if (targetLangs.length > 1) {
            boolean pointsToOtherLanguage = false;
            for (int k = 0; k < targetLangs.length && !pointsToOtherLanguage; k++) {
                pointsToOtherLanguage = containLangKeys(str2, k) && !targetLangs[k].equals(str3);
            }
            if (pointsToOtherLanguage && d >= this._absthres) {
                return 2.0d;
            }
        }

        // The original ended with a ternary on link language, token count and page score whose two
        // outcomes were both 1.0; the condition has no side effects, so only the constant remains.
        return (pageInTarget && linkInTarget) ? 1.0d : 0.0d;
    }
}
