package gr.ilsp.fmc.parser;

import bixo.datum.UrlDatum;
import bixo.urls.BaseUrlFilter;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.regex.Pattern;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapred.JobConf;
import org.apache.log4j.Logger;
import org.mortbay.util.StringUtil;

/* loaded from: input_file:gr/ilsp/fmc/parser/DomainUrlFilter.class */
/**
 * URL filter that keeps only http/https URLs whose host ends with one of a
 * configured set of domains and whose text does not end with a known
 * binary/media/archive suffix.
 *
 * <p>When no domain is configured, no host restriction is applied and only the
 * protocol and suffix checks decide whether a URL is removed.
 */
public class DomainUrlFilter extends BaseUrlFilter {
    private static final Logger LOGGER = Logger.getLogger(DomainUrlFilter.class);

    /** Matches URLs that end with a file extension which should not be fetched. */
    private static final Pattern SUFFIX_EXCLUSION_PATTERN = Pattern.compile("(?i)\\.(zip|gzip|gz|sit|bz|bz2|tar|tgz|exe|arff|au|avi|class|fig|gif|hqx|ica|jpeg|jpg|mat|mdb|mov|mp3|mpeg|mpg|msi|pcx|pdb|psd|ram|rar|raw|rmd|rmx|sav|sdd|shar|tga|tif|tiff|vo|wav|wmv|wmz|xbm|xpm|z)$");

    /** Matches URLs that start with an accepted scheme (http or https, any case). */
    private static final Pattern PROTOCOL_INCLUSION_PATTERN = Pattern.compile("(?i)^(http|https)://");

    /** Accepted host suffixes; an empty list means "accept every host". */
    private final ArrayList<String> _domain = new ArrayList<>();

    /**
     * Creates a filter restricted to a single domain.
     *
     * @param str host suffix to accept, or {@code null} to accept every host
     */
    public DomainUrlFilter(String str) {
        if (str != null) {
            this._domain.add(str);
        }
    }

    /**
     * Creates a filter whose accepted domains are read from a UTF-8 text file,
     * one domain per line. Lines are trimmed, a single trailing {@code /} is
     * removed, and lines that end up empty are skipped.
     *
     * <p>If the file cannot be read, the error is logged and the filter keeps
     * whatever domains were read before the failure (possibly none, in which
     * case no host restriction is applied).
     *
     * @param path location of the domain list, resolved through the Hadoop
     *             file system returned by {@code path.getFileSystem(...)}
     */
    public DomainUrlFilter(Path path) {
        JobConf jobConf = new JobConf();
        // try-with-resources closes the reader on every exit path, including
        // an IOException thrown in the middle of the file.
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(path.getFileSystem(jobConf).open(path), StringUtil.__UTF8Alt))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String domain = normalizeDomain(line);
                if (!domain.isEmpty()) {
                    this._domain.add(domain);
                }
            }
        } catch (IOException e) {
            // UnsupportedEncodingException is an IOException, so it is covered here too.
            LOGGER.error("Could not read domain list from " + path + "; " + this._domain.size()
                    + " domain(s) loaded before the failure", e);
        }
    }

    /**
     * Cleans up one line of the domain list: surrounding whitespace is
     * removed and a single trailing slash is dropped.
     *
     * @param line raw line from the domain file, never {@code null}
     * @return the cleaned entry, possibly empty
     */
    private static String normalizeDomain(String line) {
        String domain = line.trim();
        // Only a slash at the very end is stripped; a slash elsewhere leaves the
        // entry untouched (such an entry can never equal a host suffix anyway).
        if (domain.endsWith("/")) {
            domain = domain.substring(0, domain.length() - 1);
        }
        return domain;
    }

    /**
     * Decides whether a URL should be dropped.
     *
     * @param urlDatum datum carrying the URL to test
     * @return {@code true} if the URL is {@code null}, does not start with
     *         http:// or https://, ends with an excluded suffix, cannot be
     *         parsed, or (when domains are configured) its host ends with
     *         none of them; {@code false} otherwise
     */
    @Override
    public boolean isRemove(UrlDatum urlDatum) {
        String url = urlDatum.getUrl();
        if (url == null) {
            return true;
        }
        // The protocol pattern already rejects empty strings, anything shorter
        // than "http://", and ftp URLs, so no separate length/prefix checks are needed.
        if (!PROTOCOL_INCLUSION_PATTERN.matcher(url).find() || SUFFIX_EXCLUSION_PATTERN.matcher(url).find()) {
            return true;
        }

        final String host;
        try {
            host = new URL(url).getHost();
        } catch (MalformedURLException e) {
            LOGGER.warn("Invalid URL: " + url);
            return true;
        }

        if (this._domain.isEmpty()) {
            return false;
        }
        // NOTE(review): this is a plain, case-sensitive suffix match, so a host such as
        // "notexample.com" passes for the domain "example.com". Kept as-is for
        // compatibility; confirm whether a label-boundary check is wanted.
        for (String domain : this._domain) {
            if (domain != null && host.endsWith(domain)) {
                return false;
            }
        }
        return true;
    }
}
