/*
 * Decompiled with CFR 0.152.
 */
package it.jrc.osint.extract.text.internal;

import com.ibm.icu.text.CharsetDetector;
import com.ibm.icu.text.CharsetMatch;
import it.jrc.emmutils.Utils;
import it.jrc.htmlparser.HTMLDocument;
import it.jrc.htmlparser.HTMLDocumentBuilder;
import it.jrc.htmlparser.TextFilter;
import it.jrc.osint.DocumentMetaItem;
import it.jrc.osint.extract.text.ErrorConditions;
import it.jrc.osint.extract.text.TextExtractPlugin;
import it.jrc.osint.extract.text.TextExtractionPreferences;
import it.jrc.osint.extract.text.TextExtractionProvider;
import it.jrc.osint.logging.LogManager;
import it.jrc.osint.logging.Logger;
import it.jrc.osint.metadata.AcquisitionMetaData;
import it.jrc.osint.metadata.ErrorStatusMetaData;
import it.jrc.osint.metadata.RepositoryMetaData;
import it.jrc.osint.operations.OperationStatus;
import it.jrc.osint.util.io.FileUtil;
import java.io.BufferedInputStream;
import java.io.InputStream;

/**
 * {@link TextExtractionProvider} that parses an HTML stream and stores the
 * extracted title, plain text, description and GUID on a
 * {@link DocumentMetaItem}.
 *
 * <p>The outcome of every extraction is also recorded on the item's
 * {@link ErrorStatusMetaData} (on failure) and {@link RepositoryMetaData}
 * (text-extracted flag).
 */
public class HTMLTextExtractionProvider implements TextExtractionProvider {
    private static final Logger log = LogManager.getLogger(HTMLTextExtractionProvider.class.getCanonicalName());

    /** Encoding handed to the parser when no charset detection is done or detection fails. */
    private static final String DEFAULT_ENCODING = "UTF-8";

    /** Acquisition source value for which charset detection is attempted. */
    private static final String LOCAL_DISK_SOURCE = "Import local disk";

    /**
     * First argument passed to {@link OperationStatus} for every failure.
     * NOTE(review): presumably an error severity/level code - confirm against OperationStatus.
     */
    private static final int FAILURE_STATUS_CODE = 4;

    /** An existing description shorter than this is replaced by the one the text filter produces. */
    private static final int MIN_DESCRIPTION_LENGTH = 30;

    /**
     * Extracts text from the given HTML stream into {@code anItem}.
     *
     * <p>Content-page extraction is used only when plugin preferences are
     * available and request it; otherwise all text is extracted.
     *
     * @param in     HTML input; closed by this call
     * @param anItem item that receives title, text, description and GUID
     * @return {@link OperationStatus#OK_STATUS} on success, otherwise a
     *         status carrying the text-extraction-failed error condition
     */
    @Override
    public OperationStatus extract(InputStream in, DocumentMetaItem anItem) {
        TextExtractionPreferences prefs = TextExtractPlugin.getPlugin().getPreferences();
        boolean contentMode = prefs != null && prefs.useContentTextExtractionMode();
        return this.performExtract(in, anItem, contentMode);
    }

    /**
     * Parses the stream, runs the text filter and copies the results onto the item.
     *
     * @param in                    HTML input; always closed before returning
     * @param anItem                item to populate
     * @param contentPageExtraction {@code true} to use {@code TextFilter.parse},
     *                              {@code false} to use {@code TextFilter.parseAllText}
     * @return the resulting status (never thrown as an exception; any
     *         {@link Throwable} is logged and converted into a failure status)
     */
    private OperationStatus performExtract(InputStream in, DocumentMetaItem anItem, boolean contentPageExtraction) {
        ErrorStatusMetaData emd = new ErrorStatusMetaData(anItem);
        RepositoryMetaData rmd = new RepositoryMetaData(anItem);
        try {
            InputStream source = in;
            String encoding = DEFAULT_ENCODING;
            if (this.isLocalDiskImport(anItem)) {
                // The detector reads ahead and then resets the stream, so the
                // parser must read from the very same mark-capable stream.
                source = ensureMarkSupported(in);
                encoding = this.detectEncoding(source);
            }

            HTMLDocument theDoc = new HTMLDocumentBuilder().parse(source, encoding);
            anItem.setTitle(theDoc.getTitle());

            TextFilter textFilter = new TextFilter();
            boolean parsed = contentPageExtraction
                    ? textFilter.parse(theDoc, null)
                    : textFilter.parseAllText(theDoc, null);
            if (!parsed) {
                return recordFailure(emd, rmd, new OperationStatus(FAILURE_STATUS_CODE,
                        ErrorConditions.getErrorConditions().getErrorCondition(ErrorConditions.TEXT_EXTRACTION_FAILED)));
            }

            String text = textFilter.getPlainText();
            if (text == null || text.trim().isEmpty()) {
                log.error("Text Extraction failed for item:" + anItem.getResourceId());
                return recordFailure(emd, rmd, new OperationStatus(FAILURE_STATUS_CODE,
                        ErrorConditions.getErrorConditions().getErrorCondition(ErrorConditions.TEXT_EXTRACTION_FAILED)));
            }

            anItem.setText(text);
            // Keep an existing description only if it is reasonably long.
            String description = anItem.getDescription();
            if (description == null || description.length() < MIN_DESCRIPTION_LENGTH) {
                anItem.setDescription(textFilter.getDescription());
            }
            anItem.setGuid(Utils.createGuid(text));
            rmd.setTextExtracted(true);
            return OperationStatus.OK_STATUS;
        } catch (Throwable t) {
            // Deliberately broad: this method reports every problem through
            // the returned status rather than by throwing.
            log.error("Failed to extract text from html for link: " + anItem.getLink(), t);
            return recordFailure(emd, rmd, new OperationStatus(FAILURE_STATUS_CODE,
                    ErrorConditions.getErrorConditions().getErrorCondition(ErrorConditions.TEXT_EXTRACTION_FAILED, t)));
        } finally {
            // Closing the original stream is sufficient; a BufferedInputStream
            // wrapper holds no resource of its own.
            FileUtil.close(in);
        }
    }

    /** Stores the failure status on the item's metadata and returns it. */
    private static OperationStatus recordFailure(ErrorStatusMetaData emd, RepositoryMetaData rmd, OperationStatus failure) {
        emd.setOperationStatus(failure);
        rmd.setTextExtracted(false);
        return failure;
    }

    /** Returns whether the item's acquisition source is the local-disk import. */
    private boolean isLocalDiskImport(DocumentMetaItem metaItem) {
        AcquisitionMetaData amd = new AcquisitionMetaData(metaItem);
        return LOCAL_DISK_SOURCE.equals(amd.getSource());
    }

    /**
     * Returns {@code in} itself when it already supports mark/reset (or is
     * {@code null}), otherwise a {@link BufferedInputStream} around it.
     */
    private static InputStream ensureMarkSupported(InputStream in) {
        if (in == null || in.markSupported()) {
            return in;
        }
        return new BufferedInputStream(in);
    }

    /**
     * Best-effort charset detection on the given stream.
     *
     * @param in stream to probe; should support mark/reset so the detector
     *           can rewind it after reading ahead
     * @return the detected charset name, or {@value #DEFAULT_ENCODING} when
     *         nothing was detected or detection failed
     */
    private String detectEncoding(InputStream in) {
        try {
            CharsetDetector detector = new CharsetDetector();
            detector.setText(in);
            CharsetMatch match = detector.detect();
            if (match != null) {
                String detectedEncoding = match.getName();
                if (detectedEncoding != null && !detectedEncoding.isEmpty()) {
                    return detectedEncoding;
                }
            }
        } catch (Throwable t) {
            // Detection is optional: log and fall back to the default encoding.
            log.error("Failed to determine text encoding", t);
        }
        return DEFAULT_ENCODING;
    }
}

