package org.webslinger.nutch;

import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.html.dom.HTMLDocumentImpl;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.parse.html.DOMBuilder;
import org.apache.nutch.parse.html.DOMContentUtils;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.LogUtil;
import org.ccil.cowan.tagsoup.Parser;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/webslinger/nutch/SearchableContentFilter.class */
/**
 * HTML parse filter that replaces the parsed text of a page with the text found
 * under the nodes selected by a configurable XPath expression.
 *
 * <p>The expression is read from the {@code filter.webslinger.contentPath}
 * configuration property. When that property is missing, empty, or does not
 * compile as XPath, the filter is a no-op and returns the parse result it was
 * given.
 */
public class SearchableContentFilter implements HtmlParseFilter {
    public static final Log LOG = LogFactory.getLog("org.webslinger.nutch");
    private Configuration conf;
    /** Compiled content selector; {@code null} means the filter is disabled. */
    private XPathExpression xPathExpression;
    /** Value of {@code parser.html.impl}; defaults to {@code "neko"}. */
    private String parserImpl;
    private DOMContentUtils utils;

    /**
     * Returns the DOM helper created by the last call to {@link #setConf}.
     *
     * @return the helper, or {@code null} if {@code setConf} has not been called
     */
    public DOMContentUtils getUtils() {
        return this.utils;
    }

    /**
     * Stores the configuration and (re)initialises the parser choice, the DOM
     * helper and the compiled XPath expression from it.
     *
     * @param configuration configuration to read {@code parser.html.impl} and
     *                      {@code filter.webslinger.contentPath} from
     */
    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.parserImpl = getConf().get("parser.html.impl", "neko");
        this.utils = new DOMContentUtils(configuration);

        // Start disabled; only a successfully compiled expression enables the filter.
        this.xPathExpression = null;
        String contentPath = getConf().get("filter.webslinger.contentPath");
        if (contentPath == null || contentPath.length() == 0) {
            return;
        }
        try {
            this.xPathExpression = XPathFactory.newInstance().newXPath().compile(contentPath);
        } catch (XPathExpressionException e) {
            // Report through the plugin logger instead of stderr and leave the filter disabled.
            LOG.error("Invalid XPath in filter.webslinger.contentPath: " + contentPath, e);
        }
    }

    /**
     * Returns the configuration passed to the last {@link #setConf} call.
     *
     * @return the stored configuration, possibly {@code null}
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Replaces the parse text registered for {@code content}'s URL with the text
     * of the nodes selected by the configured XPath expression.
     *
     * <p>The result is returned unchanged when no expression is configured, when
     * the expression selects no nodes, or when no parse is registered for the
     * URL. If evaluating the expression fails, the entry for the URL is replaced
     * by the empty parse derived from the failure status.
     *
     * @param content          fetched content; only its URL is used here
     * @param parseResult      parse result to update in place
     * @param hTMLMetaTags     unused by this filter
     * @param documentFragment DOM of the page the expression is evaluated against
     * @return {@code parseResult}, possibly with the entry for the URL replaced
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags hTMLMetaTags, DocumentFragment documentFragment) {
        if (this.xPathExpression == null) {
            return parseResult;
        }

        NodeList matches;
        try {
            matches = (NodeList) this.xPathExpression.evaluate(documentFragment, XPathConstants.NODESET);
        } catch (XPathExpressionException e) {
            Parse emptyParse = new ParseStatus(e).getEmptyParse(getConf());
            parseResult.put(content.getUrl(), new ParseText(emptyParse.getText()), emptyParse.getData());
            // Nothing was selected, so stop here rather than falling through to the node handling.
            return parseResult;
        }
        if (matches == null || matches.getLength() == 0) {
            return parseResult;
        }

        // The replacement entry reuses the existing parse data, so an existing parse is required.
        Parse existing = parseResult.get(content.getUrl());
        if (existing == null) {
            LOG.warn("No parse registered for " + content.getUrl() + "; leaving parse result unchanged");
            return parseResult;
        }

        StringBuffer text = new StringBuffer();
        for (int i = 0; i < matches.getLength(); i++) {
            this.utils.getText(text, matches.item(i));
        }
        parseResult.put(content.getUrl(), new ParseText(text.toString()), existing.getData());
        return parseResult;
    }

    /**
     * Parses the input with TagSoup when {@code parser.html.impl} is
     * {@code "tagsoup"} (case-insensitive), otherwise with NekoHTML.
     */
    private DocumentFragment parse(InputSource inputSource) throws Exception {
        if (this.parserImpl.equalsIgnoreCase("tagsoup")) {
            return parseTagSoup(inputSource);
        }
        return parseNeko(inputSource);
    }

    /** Parses the input with TagSoup into a fragment of a fresh HTML document. */
    private DocumentFragment parseTagSoup(InputSource inputSource) throws Exception {
        HTMLDocumentImpl document = new HTMLDocumentImpl();
        DocumentFragment fragment = document.createDocumentFragment();
        DOMBuilder builder = new DOMBuilder(document, fragment);
        Parser parser = new Parser();
        parser.setContentHandler(builder);
        parser.setFeature("http://www.ccil.org/~cowan/tagsoup/features/ignore-bogons", true);
        parser.setFeature("http://www.ccil.org/~cowan/tagsoup/features/bogons-empty", false);
        parser.setProperty("http://xml.org/sax/properties/lexical-handler", builder);
        parser.parse(inputSource);
        return fragment;
    }

    /**
     * Parses the input with NekoHTML, collecting every fragment the parser
     * yields into a single result fragment.
     */
    private DocumentFragment parseNeko(InputSource inputSource) throws Exception {
        DOMFragmentParser parser = new DOMFragmentParser();
        try {
            parser.setFeature("http://apache.org/xml/features/include-comments", true);
            parser.setFeature("http://apache.org/xml/features/augmentations", true);
            parser.setFeature("http://cyberneko.org/html/features/balance-tags/ignore-outside-content", false);
            parser.setFeature("http://cyberneko.org/html/features/balance-tags/document-fragment", true);
            parser.setFeature("http://cyberneko.org/html/features/report-errors", true);
        } catch (SAXException e) {
            // Feature setup stays best-effort: parsing continues with whatever was applied.
            if (LOG.isDebugEnabled()) {
                LOG.debug("Could not set a NekoHTML parser feature", e);
            }
        }

        HTMLDocumentImpl document = new HTMLDocumentImpl();
        document.setErrorChecking(false);
        DocumentFragment result = document.createDocumentFragment();
        DocumentFragment first = document.createDocumentFragment();
        parser.parse(inputSource, first);
        result.appendChild(first);

        // Keep parsing until the parser yields an empty fragment or fails.
        while (true) {
            try {
                DocumentFragment next = document.createDocumentFragment();
                parser.parse(inputSource, next);
                if (!next.hasChildNodes()) {
                    break;
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info(" - new frag, " + next.getChildNodes().getLength() + " nodes.");
                }
                result.appendChild(next);
            } catch (Exception e2) {
                e2.printStackTrace(LogUtil.getWarnStream(LOG));
                // Stop on failure; retrying the same input could otherwise loop forever.
                break;
            }
        }
        return result;
    }
}
