package gr.ilsp.fmc.utils;

import java.util.HashMap;
import org.apache.commons.cli.HelpFormatter;
import org.apache.lucene.analysis.shingle.ShingleFilter;

/* loaded from: input_file:gr/ilsp/fmc/utils/ContentNormalizer.class */
/**
 * Static helpers that clean up extracted text: characters that are invisible,
 * mis-decoded or typographic variants are mapped to a plain equivalent (or
 * dropped), and whitespace around the {@code <text>} / {@code <boiler>} markers
 * is tidied.
 *
 * <p>All replacement data is written with Unicode escapes so that the source
 * file does not depend on the encoding used to compile it.
 */
public class ContentNormalizer {
    /** Replacement for space-like and invisible formatting characters. */
    private static final String SPACE = " ";
    /** Replacement for dash-like characters. */
    private static final String HYPHEN = "-";
    /** RIGHT SINGLE QUOTATION MARK (U+2019). */
    private static final String APOSTROPHE = "\u2019";
    /** BULLET (U+2022). */
    private static final String BULLET = "\u2022";
    /** GREEK CAPITAL LETTER DELTA (U+0394). */
    private static final String DELTA = "\u0394";
    /** Replacement for characters that are simply dropped. */
    private static final String REMOVED = "";

    /**
     * Maps each unwanted character to the text that replaces it. Every key is a
     * single BMP character and no replacement text contains a character that is
     * itself remapped, so applying the table in one pass gives the same result
     * as applying the entries one after another in any order.
     *
     * <p>The identity entries of the historical table (hyphen-minus to "-" and
     * greater-than to ">") are omitted because they never changed the text.
     */
    private static final HashMap<Character, String> CHAR_REPLACEMENTS = new HashMap<Character, String>();

    static {
        // Byte-order mark, no-break spaces, directional marks, a C1 control and
        // the fixed-width spaces U+2002..U+200A all become an ordinary space.
        mapAll(SPACE,
                '\uFEFF', '\u00A0', '\u200E', '\u200F', '\u0081', '\u202F',
                '\u2002', '\u2003', '\u2004', '\u2005', '\u2006', '\u2007',
                '\u2008', '\u2009', '\u200A');

        // Dash variants become a plain hyphen-minus.
        mapAll(HYPHEN,
                '\u0096', '\u0097', '\u2012', '\u2013', '\u2014', '\u2015',
                '\u2E17', '\u2E3A', '\u2E3B', '\u301C', '\u3030', '\u30A0',
                '\uFE31', '\uFE32', '\uFE58', '\uFE63', '\uFF0D');

        // C1 code points below presumably come from Windows-1252 bytes that were
        // decoded as ISO-8859-1 (0x80 euro, 0x92-0x95 quotes and bullet).
        mapAll(APOSTROPHE, '\u0092', '\u0313', '\u02BC');
        mapAll("\u201D", '\u0094'); // RIGHT DOUBLE QUOTATION MARK
        mapAll("\u201C", '\u0093'); // LEFT DOUBLE QUOTATION MARK
        mapAll("\u20AC", '\u0080'); // EURO SIGN

        // U+F0B7 used to be listed twice (first as "removed", then as a bullet);
        // the later entry won, so it is a bullet here.
        mapAll(BULLET, '\u0095', '\uF0B7', '\u2043', '\uC2B3');

        mapAll(DELTA, '\u2206', '\u225C');
        mapAll("\u03A9", '\u2126'); // OHM SIGN -> GREEK CAPITAL LETTER OMEGA
        mapAll("\u25BA", '\u25B6'); // -> BLACK RIGHT-POINTING POINTER

        // The replacement character and a set of private-use code points are
        // dropped entirely.
        mapAll(REMOVED,
                '\uFFFD', '\uF0D8', '\uF02D', '\uF0FC', '\uF034', '\uF076',
                '\uF0BC', '\uF06C', '\uF0E8', '\uF0A7', '\uF0FB', '\uF06E',
                '\uF0F1', '\uF075');
    }

    /**
     * Matches every character that is NOT allowed by the XML 1.0 {@code Char}
     * production: tab, CR, LF, U+0020..U+D7FF, U+E000..U+FFFD and
     * U+10000..U+10FFFF are the allowed ones. The last range is written as the
     * surrogate pairs of its two end points; the regex engine reads each pair as
     * one supplementary code point.
     */
    private static final String XML10PATTERN =
            "[^\t\r\n\u0020-\uD7FF\uE000-\uFFFD\uD800\uDC00-\uDBFF\uDFFF]";

    /**
     * Normalizes marked-up text: applies the character replacement table, turns
     * tabs into spaces, collapses every run of two or more whitespace characters
     * into one space, trims spaces just inside the {@code <text>} and
     * {@code <boiler>} markers, removes empty marker pairs, and finally removes
     * a space before a line break and collapses repeated line breaks.
     *
     * @param str the text to normalize; must not be {@code null}
     * @return the normalized text
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String normalizeText(String str) {
        return replaceInvalidChars(str)
                .replace('\t', ' ')
                .replaceAll("\\s{2,}", SPACE)
                .replace("<text> ", "<text>")
                .replace("<boiler> ", "<boiler>")
                .replace(" </text>", "</text>")
                // NOTE(review): unlike " </text>" this pattern also requires (and
                // removes) a trailing space; kept as-is to preserve output.
                .replace(" </boiler> ", "</boiler>")
                .replace("<text></text>", "")
                .replace("<boiler></boiler>", "")
                .replace(" \n", "\n")
                .replaceAll("\n{2,}", "\n");
    }

    /**
     * Normalizes text extracted from a PDF: applies the character replacement
     * table, removes characters that are not legal in XML 1.0, removes a space
     * before a line break, collapses runs of three or more line breaks into
     * one, and prefixes the result with a line break.
     *
     * @param str the text to normalize; must not be {@code null}
     * @return the normalized text, always starting with a line break
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static String normalizePdfText(String str) {
        String cleaned = replaceInvalidChars(str)
                .replaceAll(XML10PATTERN, "")
                .replace(" \n", "\n")
                .replaceAll("\n{3,}", "\n");
        return splitParagraphs("\n" + cleaned);
    }

    /**
     * Placeholder for paragraph splitting. It returns its input unchanged: the
     * previous body only measured the length of each line into an array that
     * was never read, so no splitting logic exists yet.
     *
     * @param str the text to process
     * @return {@code str}, unchanged
     */
    private static String splitParagraphs(String str) {
        return str;
    }

    /** Registers {@code replacement} for each of the given characters. */
    private static void mapAll(String replacement, char... chars) {
        for (char c : chars) {
            CHAR_REPLACEMENTS.put(c, replacement);
        }
    }

    /**
     * Applies {@link #CHAR_REPLACEMENTS} to every character of {@code str} in a
     * single pass; characters without an entry are copied through.
     */
    private static String replaceInvalidChars(String str) {
        StringBuilder out = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            String replacement = CHAR_REPLACEMENTS.get(c);
            if (replacement == null) {
                out.append(c);
            } else {
                out.append(replacement);
            }
        }
        return out.toString();
    }
}
