package gr.ilsp.fmc.main;

import gr.ilsp.fmc.exporter.SampleExporter;
import gr.ilsp.fmc.utils.FcFileUtils;
import gr.ilsp.fmc.utils.TopicTools;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.mapred.lib.aggregate.ValueAggregatorDescriptor;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.xerces.impl.xs.SchemaSymbols;

/* loaded from: input_file:gr/ilsp/fmc/main/DedupMD5.class */
public class DedupMD5 {
    // Directory holding the documents to deduplicate; (re)assigned at the start of dedup() and dedupnew().
    private static File input;
    // File that receives the list of surviving documents; (re)assigned by dedup() and dedupnew().
    private static File out_textfile;
    // NOTE(review): the logger category is SampleExporter, not DedupMD5 - looks like a copy/paste slip; confirm before changing.
    private static final Logger LOGGER = Logger.getLogger(SampleExporter.class);
    // Set to true by dedup()/dedupnew() when an HTML output path is supplied; never reset to false in this class.
    private static boolean html = false;
    // File extension (without the dot) used to select the input documents and to derive the sibling .html names.
    private static String input_type = "xml";
    // calculate() only counts tokens whose length is strictly greater than this value.
    private static int MIN_TOKEN_LEN = 3;
    // calculate(): the quantisation step is round(maxTokenFrequency * QUANT_RATE) ...
    private static float QUANT_RATE = 0.01f;
    // ... and falls back to this value (or to 1) when that step is smaller than it. (Name is spelled "QAUNT" in the original.)
    private static int QAUNT_DEFAULT = 5;
    // dedupnew(): two documents are treated as duplicates when the shared-paragraph token ratio exceeds this threshold.
    // Overridable through the 5th command-line argument.
    private static double inter_thr = 0.7d;
    // dedupnew(): analysed tokens must be strictly longer than this to contribute to a paragraph hash.
    // Overridable through the 6th command-line argument.
    private static int min_tok_len = 3;
    // Platform file separator used when concatenating paths.
    private static String fs = System.getProperty("file.separator");
    // Selected by the 4th command-line argument in main(): "2" runs dedupnew() only, "0" (default) runs dedup() then dedupnew();
    // the remaining accepted value runs dedup() only.
    private static String methodtype = "0";

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/main/DedupMD5$TextAttr.class */
    /**
     * Simple holder pairing a document's file name with the length of its
     * extracted text. Used by {@code dedup} to decide which of two documents
     * with the same digest is kept.
     */
    public static class TextAttr {
        /** Number of characters of the text extracted from the document. */
        public int length;
        /** File name (no directory part) of the document. */
        public String filename;

        /**
         * @param textLength length of the extracted text
         * @param name       file name of the document
         */
        public TextAttr(int textLength, String name) {
            length = textLength;
            filename = name;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/main/DedupMD5$Token.class */
    /**
     * A token of the text profile together with its (possibly quantised)
     * occurrence count. Instances are mutated in place by {@code calculate}.
     */
    public static class Token {
        /** Occurrence count of the token. */
        public int cnt;
        /** The token text. */
        public String val;

        /**
         * @param count initial occurrence count
         * @param value token text
         */
        public Token(int count, String value) {
            cnt = count;
            val = value;
        }

        /** Renders the token as its text, the shingle token separator, then its count. */
        @Override
        public String toString() {
            return val + ShingleFilter.TOKEN_SEPARATOR + cnt;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:gr/ilsp/fmc/main/DedupMD5$TokenComparator.class */
    /**
     * Orders {@link Token}s by descending occurrence count.
     */
    public static class TokenComparator implements Comparator<Token> {
        private TokenComparator() {
        }

        /**
         * Accessor constructor emitted by the compiler/decompiler; the argument is
         * ignored. Kept because {@code calculate} instantiates the comparator
         * through it.
         */
        /* synthetic */ TokenComparator(TokenComparator tokenComparator) {
            this();
        }

        /**
         * Compares by count, larger counts first. Uses {@link Integer#compare}
         * instead of subtraction so the result cannot be corrupted by integer
         * overflow.
         */
        @Override // java.util.Comparator
        public int compare(Token token, Token token2) {
            return Integer.compare(token2.cnt, token.cnt);
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>Arguments:
     * <ol>
     *   <li>base directory; the documents are expected in its {@code xml} sub-directory</li>
     *   <li>name of the document-list file, relative to the base directory</li>
     *   <li>name of the HTML index file, relative to the base directory</li>
     *   <li>(optional) method: {@code 1} = MD5 profile only, {@code 2} = common
     *       paragraphs only, {@code 0} (default) = both, in that order</li>
     *   <li>(optional) intersection threshold used by {@code dedupnew}</li>
     *   <li>(optional) minimum token length used by {@code dedupnew}</li>
     * </ol>
     *
     * <p>Missing mandatory arguments or malformed numeric options print a usage
     * message and terminate with exit code 64 (the code the dedup methods use
     * for invalid input) instead of failing with an uncaught exception.
     *
     * @param strArr command-line arguments as described above
     */
    public static void main(String[] strArr) {
        final String usage = "Usage: DedupMD5 <base_dir> <list_file> <html_file> "
                + "[method: 0|1|2] [intersection_threshold] [min_token_length]";
        if (strArr == null || strArr.length < 3) {
            System.err.println(usage);
            System.exit(64);
            return;
        }
        String baseDir = strArr[0];
        String listPath = baseDir + fs + strArr[1];
        String htmlPath = baseDir + fs + strArr[2];
        if (strArr.length > 3) {
            methodtype = strArr[3];
        }
        try {
            if (strArr.length > 4) {
                inter_thr = Double.parseDouble(strArr[4]);
            }
            if (strArr.length > 5) {
                min_tok_len = Integer.parseInt(strArr[5]);
            }
        } catch (NumberFormatException e) {
            System.err.println("Invalid numeric argument: " + e.getMessage());
            System.err.println(usage);
            System.exit(64);
            return;
        }

        boolean runBoth = methodtype.equals("0");
        // The Xerces constant is a decompiler artifact standing in for the method id of the MD5-only mode.
        boolean runMd5 = runBoth || methodtype.equals(SchemaSymbols.ATTVAL_TRUE_1);
        boolean runParagraphs = runBoth || methodtype.equals("2");
        if (runMd5) {
            LOGGER.info("Deduplication by using lists and MD5 method.");
            dedup(baseDir, listPath, htmlPath, true);
        }
        if (runParagraphs) {
            LOGGER.info("Deduplication based on common paragraphs.");
            dedupnew(baseDir, listPath, htmlPath, true);
        }
        if (!runMd5 && !runParagraphs) {
            // Previously an unknown method id was skipped without any feedback.
            LOGGER.warn("Unknown deduplication method '" + methodtype + "'; no deduplication was performed.");
        }
        Bitexts.counttokens(baseDir + fs + "xml", "xml", "crawlinfo=");
        System.out.println("tokens calculated.");
    }

    /**
     * Removes exact/near-exact duplicates by comparing the MD5 digest of each
     * document's text profile (see {@code calculate}).
     *
     * <p>Every file in {@code <str>/xml} whose name ends with
     * {@code "." + input_type} is read through
     * {@code ReadResources.extractTextfromXML_clean}. Documents with empty text
     * are skipped entirely (they are neither deleted nor listed). When two
     * documents share a digest, the one with the longer extracted text is kept
     * and the other one's {@code .xml}, {@code .html} and {@code .xml.html}
     * files are deleted through {@code FcFileUtils.delete}. The paths of the
     * kept documents are then written to {@code str2} and, if {@code str3} is
     * non-null, an HTML index is produced with {@link #writeHTMLfile}.
     *
     * <p>The process exits with code 64 if the document directory or the list
     * file does not exist.
     *
     * @param str  base directory containing the {@code xml} sub-directory
     * @param str2 path of the document-list file; must already exist and is overwritten
     * @param str3 path of the HTML index to write, or {@code null} for none
     * @param z    forwarded to {@link #writeHTMLfile}
     */
    public static void dedup(String str, String str2, String str3, boolean z) {
        String xmlDirName = str + fs + "xml";
        // NOTE(review): appears to strip a URI-like "scheme:/" prefix while leaving
        // Windows drive letters (colon at index 1) untouched - confirm intent.
        int sepPos = xmlDirName.indexOf(ValueAggregatorDescriptor.TYPE_SEPARATOR);
        if (sepPos < 2) {
            input = new File(xmlDirName);
        } else {
            input = new File(xmlDirName.substring(sepPos + 2));
        }
        if (!input.exists() || !input.isDirectory()) {
            System.err.println("the directory with the cesdoc files does not exist!!!!!!!!");
            System.exit(64);
        }
        out_textfile = new File(str2);
        if (!out_textfile.exists()) {
            System.err.println("List of cesdoc files does not exist!!!!!!!!");
            System.exit(64);
        }
        if (str3 != null) {
            html = true;
        }
        File listedDir = new File(input.getParent() + fs + "xml");

        final String suffix = "." + input_type;
        File[] xmlFiles = input.listFiles(new FilenameFilter() {
            @Override // java.io.FilenameFilter
            public boolean accept(File dir, String name) {
                // endsWith is safe for names shorter than the suffix (substring was not).
                return name.endsWith(suffix);
            }
        });
        // listFiles returns null on an I/O error; treat that like "nothing to do".
        if (xmlFiles == null || xmlFiles.length < 2) {
            LOGGER.info("The input list contains less than 2 files.");
            return;
        }
        LOGGER.info(xmlFiles.length + " files will be processed.");
        long startMillis = System.currentTimeMillis();

        // digest key -> document currently kept for that digest
        HashMap<String, TextAttr> keptByDigest = new HashMap<String, TextAttr>();
        StringBuilder duplicatePairs = new StringBuilder();
        int thousands = 0;
        for (int idx = 0; idx < xmlFiles.length; idx++) {
            File xmlFile = xmlFiles[idx];
            String text = ReadResources.extractTextfromXML_clean(xmlFile.getAbsolutePath(), "p", "crawlinfo", false);
            LOGGER.debug(xmlFile.getAbsolutePath());
            if (text.isEmpty()) {
                continue;
            }
            StringBuilder keyBuilder = new StringBuilder();
            for (byte b : calculate(text)) {
                keyBuilder.append((int) b);
            }
            String key = keyBuilder.toString();

            TextAttr candidate = new TextAttr(text.length(), xmlFile.getName());
            TextAttr kept = keptByDigest.get(key);
            if (kept == null) {
                keptByDigest.put(key, candidate);
            } else {
                duplicatePairs.append(candidate.filename).append("\t\t").append(kept.filename).append("\n");
                // Decide which document loses BEFORE touching the map, so that the
                // files removed from disk are those of the shorter document and the
                // map keeps pointing at a file that still exists.
                TextAttr loser;
                if (candidate.length > kept.length) {
                    loser = kept;
                    keptByDigest.put(key, candidate);
                } else {
                    loser = candidate;
                }
                LOGGER.debug("OUT\t" + loser.filename);
                String xmlPath = input.getPath() + fs + loser.filename;
                FcFileUtils.delete(xmlPath);
                String htmlPath = xmlPath.replace("." + input_type, ".html");
                FcFileUtils.delete(htmlPath);
                FcFileUtils.delete(htmlPath.replace(".html", ".xml.html"));
            }
            if (idx / 1000 > thousands) {
                thousands++;
                LOGGER.info("Lists for more than " + (thousands * 1000) + " files have been checked.");
            }
        }
        LOGGER.debug(duplicatePairs.toString());

        int remaining = 0;
        // try-with-resources closes the writer even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out_textfile.getAbsolutePath()), "UTF-8"))) {
            for (TextAttr attr : keptByDigest.values()) {
                writer.write((listedDir.getAbsolutePath() + fs + attr.filename).replace("\\", "/").trim() + "\n");
                remaining++;
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
            System.err.println("Error in writing the output text file. The file does not exist.");
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
            System.err.println("Error in writing the output text file. The encoding is not supported.");
        } catch (IOException e3) {
            e3.printStackTrace();
            System.err.println("Error in writing the output text file.");
        }
        if (html) {
            try {
                writeHTMLfile(new File(str3).getAbsolutePath(), ReadResources.readFileAsString(out_textfile.getAbsolutePath()), z);
            } catch (IOException e4) {
                e4.printStackTrace();
            }
        }
        LOGGER.info("Deduplication completed in " + (System.currentTimeMillis() - startMillis) + " milliseconds. " + remaining + " files remained.");
    }

    /**
     * Computes the MD5 digest of a quantised token-frequency profile of the
     * given text.
     *
     * <p>Steps: split the text into lower-cased runs of letters/digits, count
     * the runs longer than {@code MIN_TOKEN_LEN}, round every count down to a
     * multiple of the quantisation step, drop tokens whose rounded count is
     * below the step, sort the rest by descending count and hash the
     * newline-joined {@code Token.toString()} lines.
     *
     * @param str text to fingerprint
     * @return the 16-byte MD5 digest of the profile
     */
    private static byte[] calculate(String str) {
        HashMap<String, Token> frequencies = new HashMap<String, Token>();
        StringBuilder word = new StringBuilder();
        final int textLength = str.length();
        for (int pos = 0; pos < textLength; pos++) {
            char ch = str.charAt(pos);
            if (Character.isLetterOrDigit(ch)) {
                word.append(Character.toLowerCase(ch));
                continue;
            }
            if (word.length() == 0) {
                continue;
            }
            // A separator ends the current word: count it if long enough, then start over.
            if (word.length() > MIN_TOKEN_LEN) {
                countToken(frequencies, word.toString());
            }
            word.setLength(0);
        }
        // The text may end in the middle of a word.
        if (word.length() > MIN_TOKEN_LEN) {
            countToken(frequencies, word.toString());
        }

        int maxFrequency = 0;
        for (Token token : frequencies.values()) {
            if (token.cnt > maxFrequency) {
                maxFrequency = token.cnt;
            }
        }

        int step = Math.round(maxFrequency * QUANT_RATE);
        if (step < QAUNT_DEFAULT) {
            if (maxFrequency > QAUNT_DEFAULT - 1) {
                step = QAUNT_DEFAULT;
            } else {
                step = 1;
            }
        }

        List<Token> profile = new ArrayList<Token>();
        for (Token token : frequencies.values()) {
            token.cnt = (token.cnt / step) * step;
            if (token.cnt >= step) {
                profile.add(token);
            }
        }
        if (profile.isEmpty()) {
            System.out.println("Empty profile");
        }
        Collections.sort(profile, new TokenComparator(null));

        StringBuilder text = new StringBuilder();
        for (int idx = 0; idx < profile.size(); idx++) {
            if (idx > 0) {
                text.append("\n");
            }
            text.append(profile.get(idx).toString());
        }
        return MD5Hash.digest(text.toString()).getDigest();
    }

    /** Increments the counter of {@code word} in {@code frequencies}, creating it on first sight. */
    private static void countToken(HashMap<String, Token> frequencies, String word) {
        Token entry = frequencies.get(word);
        if (entry == null) {
            entry = new Token(0, word);
            frequencies.put(word, entry);
        }
        entry.cnt++;
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Writes a minimal HTML page containing one link per listed file.
     *
     * <p>Each non-blank line of {@code str2} is taken as a file path and
     * resolved to an absolute path; back-slashes are rendered as forward
     * slashes. Blank lines are skipped so that an empty list does not produce a
     * bogus link to the working directory. I/O errors are printed and
     * swallowed, as before, but the writer is now always closed.
     *
     * @param str  path of the HTML file to (over)write, encoded as UTF-8
     * @param str2 newline-separated list of file paths
     * @param z    if {@code true} each link targets {@code <path>.html},
     *             otherwise the path itself
     */
    public static void writeHTMLfile(String str, String str2, boolean z) {
        String[] entries = str2.split("\n");
        try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(str), "UTF-8")) {
            writer.write("<html xmlns=\"http://www.w3.org/1999/xhtml\">");
            for (String entry : entries) {
                if (entry.trim().isEmpty()) {
                    continue;
                }
                String target = new File(entry).getAbsolutePath();
                if (z) {
                    target = new File(target + ".html").getAbsolutePath();
                }
                String link = target.replace("\\", "/");
                writer.write("<br /><a href=\"" + link + "\">\n" + link + "</a>\n");
            }
            writer.write("</html>");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (UnsupportedEncodingException e2) {
            e2.printStackTrace();
        } catch (IOException e3) {
            e3.printStackTrace();
        }
    }

    /**
     * Removes near-duplicate documents based on the paragraphs they share.
     *
     * <p>Phase 1 - for every file in {@code <str>/xml} whose name ends with
     * {@code "." + input_type}: the text is split into paragraphs (lines);
     * digits are removed, the paragraph is analysed with
     * {@code TopicTools.analyze} for the document's language, tokens longer
     * than {@code min_tok_len} are joined and MD5-hashed. For each hashed
     * paragraph the whitespace-token count of the original paragraph is
     * recorded, and the per-document sum of those counts is kept.
     *
     * <p>Phase 2 - every document is compared with every other not yet settled
     * document. The overlap is the sum of the token counts of the other
     * document's paragraphs whose hash matches a not-yet-matched paragraph of
     * the current document. If the overlap divided by either document's total
     * exceeds {@code inter_thr}, the document with the smaller total (the other
     * one on a tie) is deleted together with its {@code .html} and
     * {@code .xml.html} companions.
     *
     * <p>Phase 3 - the files still present in the directory are written to
     * {@code str2} and, if {@code str3} is non-null, to an HTML index.
     *
     * <p>The process exits with code 64 if the document directory or the list
     * file does not exist.
     *
     * @param str  base directory containing the {@code xml} sub-directory
     * @param str2 path of the document-list file; must already exist and is overwritten
     * @param str3 path of the HTML index to write, or {@code null} for none
     * @param z    forwarded to {@link #writeHTMLfile}
     */
    public static void dedupnew(String str, String str2, String str3, boolean z) {
        String xmlDirName = str + fs + "xml";
        // NOTE(review): appears to strip a URI-like "scheme:/" prefix while leaving
        // Windows drive letters (colon at index 1) untouched - confirm intent.
        int sepPos = xmlDirName.indexOf(ValueAggregatorDescriptor.TYPE_SEPARATOR);
        if (sepPos < 2) {
            input = new File(xmlDirName);
        } else {
            input = new File(xmlDirName.substring(sepPos + 2));
        }
        if (!input.exists() || !input.isDirectory()) {
            System.err.println("the directory with the cesdoc files does not exist!!!!!!!!");
            System.exit(64);
        }
        out_textfile = new File(str2);
        if (!out_textfile.exists()) {
            System.err.println("List of cesdoc files does not exist!!!!!!!!");
            System.exit(64);
        }
        if (str3 != null) {
            html = true;
        }
        File listedDir = new File(input.getParent() + fs + "xml");

        final String suffix = "." + input_type;
        FilenameFilter xmlFilter = new FilenameFilter() {
            @Override // java.io.FilenameFilter
            public boolean accept(File dir, String name) {
                // endsWith is safe for names shorter than the suffix (substring was not).
                return name.endsWith(suffix);
            }
        };
        File[] xmlFiles = input.listFiles(xmlFilter);
        // listFiles returns null on an I/O error; treat that like "nothing to do".
        if (xmlFiles == null || xmlFiles.length < 2) {
            LOGGER.info("The input list contains less than 2 files.");
            return;
        }
        long startMillis = System.currentTimeMillis();

        // ---- Phase 1: per-document paragraph hashes and token counts ----
        // file name -> sum of the token counts of its hashed paragraphs (never modified afterwards)
        HashMap<String, Integer> totalTokens = new HashMap<String, Integer>();
        // file name -> token count of each hashed paragraph (parallel to paragraphHashes)
        HashMap<String, List<Integer>> paragraphTokens = new HashMap<String, List<Integer>>();
        // file name -> MD5 key of each hashed paragraph, in document order
        HashMap<String, List<String>> paragraphHashes = new HashMap<String, List<String>>();
        int hundreds = 0;
        for (int idx = 0; idx < xmlFiles.length; idx++) {
            String path = xmlFiles[idx].getAbsolutePath();
            String name = xmlFiles[idx].getName();
            String text = ReadResources.extractTextfromXML_clean(path, "p", "crawlinfo", false);
            String language = ReadResources.extractLangfromXML(path, "language", "iso639");

            List<String> hashes = new ArrayList<String>();
            List<Integer> tokenCounts = new ArrayList<Integer>();
            int total = 0;
            for (String paragraph : text.split("\n")) {
                String withoutDigits = paragraph.replaceAll("[0-9]", "");
                ArrayList<String> analysed = new ArrayList<String>();
                try {
                    analysed = TopicTools.analyze(withoutDigits, language);
                } catch (IOException e) {
                    // Best effort: a paragraph that cannot be analysed simply yields no hash.
                    LOGGER.warn(e.getMessage());
                }
                StringBuilder joined = new StringBuilder();
                for (String token : analysed) {
                    if (token.length() > min_tok_len) {
                        joined.append(ShingleFilter.TOKEN_SEPARATOR).append(token);
                    }
                }
                String normalized = joined.toString().replaceAll("(\\s){2,}", ShingleFilter.TOKEN_SEPARATOR).trim();
                LOGGER.debug(paragraph);
                if (normalized.isEmpty()) {
                    continue;
                }
                LOGGER.debug(normalized);
                StringBuilder key = new StringBuilder();
                for (byte b : MD5Hash.digest(normalized).getDigest()) {
                    key.append((int) b);
                }
                hashes.add(key.toString());
                int tokensInParagraph = new StringTokenizer(paragraph).countTokens();
                tokenCounts.add(Integer.valueOf(tokensInParagraph));
                total += tokensInParagraph;
            }
            totalTokens.put(name, Integer.valueOf(total));
            paragraphTokens.put(name, tokenCounts);
            paragraphHashes.put(name, hashes);
            if (idx / 100 > hundreds) {
                hundreds++;
                LOGGER.info("Lists for more than " + (hundreds * 100) + " files have been created.");
            }
        }

        // ---- Phase 2: pairwise comparison ----
        // totalTokens is not modified below, so iterating its key set while the
        // other two maps shrink is safe.
        Set<String> names = totalTokens.keySet();
        Set<String> settled = new HashSet<String>();
        int checked = 0;
        int thousands = 0;
        for (String current : names) {
            if (settled.contains(current)) {
                continue;
            }
            List<String> currentHashes = paragraphHashes.get(current);
            double currentPars = currentHashes.size();
            if (currentHashes.isEmpty()) {
                LOGGER.debug("file " + current + "has empty feature vector.");
                continue;
            }
            checked++;
            for (String other : names) {
                if (other.equals(current) || settled.contains(current) || settled.contains(other)) {
                    continue;
                }
                List<String> otherHashes = paragraphHashes.get(other);
                List<Integer> otherTokens = paragraphTokens.get(other);
                double otherPars = otherHashes.size();
                if (otherHashes.isEmpty()) {
                    // It is the OTHER document whose vector is empty (the message used to name the current one).
                    LOGGER.info("file " + other + "has empty feature vector.");
                    continue;
                }

                // Each paragraph of the current document may be matched at most once.
                boolean[] matched = new boolean[currentHashes.size()];
                double overlap = 0.0d;
                for (int o = otherHashes.size() - 1; o >= 0; o--) {
                    String hash = otherHashes.get(o);
                    for (int c = 0; c < currentHashes.size(); c++) {
                        if (!matched[c] && currentHashes.get(c).equals(hash)) {
                            overlap += otherTokens.get(o).intValue();
                            matched[c] = true;
                            break;
                        }
                    }
                }
                LOGGER.debug("CHECK: " + current + " with " + currentPars + "\tpars TO\t" + other + " with " + otherPars + "pars");

                int currentTotal = totalTokens.get(current).intValue();
                int otherTotal = totalTokens.get(other).intValue();
                if (overlap / otherTotal > inter_thr || overlap / currentTotal > inter_thr) {
                    LOGGER.debug(current + " pair with " + other);
                    // Keep the document with more tokens; on a tie the current one is kept.
                    String loser = otherTotal > currentTotal ? current : other;
                    String xmlPath = input.getPath() + fs + loser;
                    FcFileUtils.delete(xmlPath);
                    LOGGER.debug("deleted\t" + loser);
                    String htmlPath = xmlPath.replace("." + input_type, ".html");
                    FcFileUtils.delete(htmlPath);
                    FcFileUtils.delete(htmlPath.replace(".html", ".xml.html"));
                    settled.add(loser);
                    paragraphHashes.remove(loser);
                    paragraphTokens.remove(loser);
                }
            }
            settled.add(current);
            paragraphHashes.remove(current);
            paragraphTokens.remove(current);
            if (checked / 1000 > thousands) {
                thousands++;
                LOGGER.info("more than " + (thousands * 1000) + " files have been checked.");
            }
        }

        // ---- Phase 3: list whatever is still on disk ----
        File[] survivors = input.listFiles(xmlFilter);
        if (survivors == null) {
            survivors = new File[0];
        }
        int remaining = 0;
        // try-with-resources closes the writer even when a write fails.
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(out_textfile.getAbsolutePath()), "UTF-8"))) {
            for (File survivor : survivors) {
                writer.write((listedDir.getAbsolutePath() + fs + survivor.getName()).replace("\\", "/").trim() + "\n");
                remaining++;
            }
        } catch (FileNotFoundException e2) {
            e2.printStackTrace();
            System.err.println("Error in writing the output text file. The file does not exist.");
        } catch (UnsupportedEncodingException e3) {
            e3.printStackTrace();
            System.err.println("Error in writing the output text file. The encoding is not supported.");
        } catch (IOException e4) {
            e4.printStackTrace();
            System.err.println("Error in writing the output text file.");
        }
        if (html) {
            try {
                writeHTMLfile(new File(str3).getAbsolutePath(), ReadResources.readFileAsString(out_textfile.getAbsolutePath()), z);
            } catch (IOException e5) {
                e5.printStackTrace();
            }
        }
        LOGGER.info("New Deduplication completed in " + (System.currentTimeMillis() - startMillis) + " milliseconds. " + remaining + " files remained.");
    }
}
