package gr.ilsp.fmc.parser;

import bixo.parser.BaseContentExtractor;
import gr.ilsp.fmc.datums.ExtendedParsedDatum;
import gr.ilsp.fmc.extractors.Pdf2text;
import gr.ilsp.fmc.utils.ContentNormalizer;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.net.URL;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.Callable;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.tika.metadata.CreativeCommons;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.Parser;

/* loaded from: input_file:gr/ilsp/fmc/parser/PdfboxCallableParser.class */
/**
 * Callable that downloads the PDF named by the {@code Content-Location} metadata entry
 * into the {@code pdf} sub-directory of the configured store directory, converts it to
 * text through {@link Pdf2text}, enriches the supplied {@link Metadata} and returns the
 * result as an {@link ExtendedParsedDatum}.
 *
 * <p>The {@code Parser}, {@code BaseContentExtractor} and boolean constructor arguments
 * are accepted for signature compatibility only; this class does not use them.
 */
public class PdfboxCallableParser implements Callable<ExtendedParsedDatum> {
    private static final String EUROPE_ORG_STR = "europa.eu";
    private static final String default_Europecomment_in_url = "©European Union, 1995-2014. Reuse is authorised, provided the source is acknowledged.";
    private static final String fs1 = System.getProperty("file.separator");
    private static final Logger LOGGER = Logger.getLogger(PdfboxCallableParser.class);

    /** Stream handed to {@code ExtendedLinksExtractor.getLinks}; the PDF bytes themselves are re-fetched by URL. */
    private final InputStream _input;
    /** Metadata that is read ({@code Content-Location}) and updated in place while parsing. */
    private final Metadata _metadata;
    /** Store directory as supplied by the caller; may carry a {@code file:} prefix. */
    private final String _storedir_path;
    /** Flag forwarded to {@code Pdf2text.run1}; always {@code false} in this class. */
    private final boolean _sort_type;

    /**
     * Creates a parser task.
     *
     * @param parser               unused
     * @param baseContentExtractor unused
     * @param inputStream          stream passed to the link extractor
     * @param metadata             metadata of the fetched resource; must contain {@code Content-Location}
     * @param str                  path (optionally prefixed with {@code file:}) of the store directory
     */
    public PdfboxCallableParser(Parser parser, BaseContentExtractor baseContentExtractor, InputStream inputStream, Metadata metadata, String str) {
        this(parser, baseContentExtractor, inputStream, metadata, true, false, str);
    }

    /**
     * Creates a parser task.
     *
     * @param parser               unused
     * @param baseContentExtractor unused
     * @param inputStream          stream passed to the link extractor
     * @param metadata             metadata of the fetched resource; must contain {@code Content-Location}
     * @param z                    unused
     * @param z2                   unused
     * @param str                  path (optionally prefixed with {@code file:}) of the store directory
     */
    public PdfboxCallableParser(Parser parser, BaseContentExtractor baseContentExtractor, InputStream inputStream, Metadata metadata, boolean z, boolean z2, String str) {
        this._sort_type = false;
        this._input = inputStream;
        this._metadata = metadata;
        this._storedir_path = str;
    }

    /**
     * Downloads, converts and describes the PDF.
     *
     * @return the parsed datum; its text is {@code null} when the text conversion failed
     * @throws Exception        any exception raised while downloading or parsing, rethrown unchanged
     * @throws RuntimeException wrapping any non-{@code Exception} throwable (e.g. linkage errors)
     */
    @Override // java.util.concurrent.Callable
    public ExtendedParsedDatum call() throws Exception {
        try {
            return parse();
        } catch (Exception e) {
            // Rethrown as-is so the Throwable handler below only wraps Errors.
            throw e;
        } catch (NoSuchMethodError e) {
            throw new RuntimeException("Attempting to use excluded parser", e);
        } catch (Throwable th) {
            throw new RuntimeException("Serious shut-down error thrown from PDFBOX", th);
        }
    }

    /** Performs the download / convert / enrich pipeline behind {@link #call()}. */
    private ExtendedParsedDatum parse() throws Exception {
        String storeDir = this._storedir_path;
        if (storeDir.startsWith("file:/")) {
            // Drop only the "file:" scheme so the leading '/' of the path survives.
            storeDir = storeDir.substring(5);
        }
        File pdfDir = new File(storeDir + fs1 + "pdf");
        File[] existing = pdfDir.listFiles();
        if (existing == null) {
            // listFiles() yields null for a missing or unreadable directory.
            throw new IllegalStateException("PDF store directory is missing or not readable: " + pdfDir.getAbsolutePath());
        }
        // The new file is named after the number of entries already present in the directory.
        String pdfPath = pdfDir.getAbsolutePath() + fs1 + existing.length + ".pdf";

        download(this._metadata.get("Content-Location"), pdfPath);
        LOGGER.info(pdfPath + " saved.");

        String text = Pdf2text.run1(new File(pdfPath), this._sort_type);
        if (text == null) {
            LOGGER.info("PDF to Text Conversion failed.");
        } else {
            text = ContentNormalizer.normalizeText(text);
        }

        this._metadata.set("Content-Encoding", "UTF-8");
        this._metadata.set("comment", pdfPath);
        ExtendedOutlink[] links = ExtendedLinksExtractor.getLinks(this._input, this._metadata);

        String location = this._metadata.get("Content-Location");
        if (location.contains(EUROPE_ORG_STR)) {
            this._metadata.set(CreativeCommons.LICENSE_URL, default_Europecomment_in_url);
        }
        LOGGER.debug(location + this._metadata.get(CreativeCommons.LICENSE_URL));

        copyDocumentInformation(pdfPath);

        return new ExtendedParsedDatum(this._metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY), null, text, "", this._metadata.get("title"), links, makeMap(this._metadata));
    }

    /** Copies the content behind {@code sourceUrl} into {@code targetPath}, always closing both streams. */
    private static void download(String sourceUrl, String targetPath) throws Exception {
        BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(targetPath));
        try {
            InputStream in = new URL(sourceUrl).openConnection().getInputStream();
            try {
                byte[] buffer = new byte[1024];
                int read;
                while ((read = in.read(buffer)) != -1) {
                    out.write(buffer, 0, read);
                }
            } finally {
                in.close();
            }
        } finally {
            out.close();
        }
    }

    /**
     * Best-effort copy of author, title, keywords and producer from the PDF's document
     * information into the metadata; failures are logged and do not abort parsing.
     */
    private void copyDocumentInformation(String pdfPath) {
        PDDocument document = null;
        try {
            document = PDDocument.load(pdfPath);
            PDDocumentInformation info = document.getDocumentInformation();
            this._metadata.set(MSOffice.AUTHOR, info.getAuthor());
            this._metadata.set("title", info.getTitle());
            this._metadata.set(MSOffice.KEYWORDS, info.getKeywords());
            this._metadata.set(DublinCore.PUBLISHER, info.getProducer());
        } catch (Exception e) {
            LOGGER.warn("An exception occured in parsing the PDF Document.", e);
        } finally {
            if (document != null) {
                try {
                    document.close();
                } catch (Exception e) {
                    LOGGER.warn("Could not close PDF document " + pdfPath, e);
                }
            }
        }
    }

    /** Flattens the metadata into a map holding the value {@code Metadata.get} returns for each name. */
    private static Map<String, String> makeMap(Metadata metadata) {
        Map<String, String> result = new HashMap<String, String>();
        for (String name : metadata.names()) {
            result.put(name, metadata.get(name));
        }
        return result;
    }
}
