package org.apache.cocoon.components.crawler;

import java.net.URL;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.avalon.framework.logger.AbstractLogEnabled;
import org.apache.cocoon.Constants;
import org.apache.cocoon.util.Tokenizer;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

/* loaded from: input_file:org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl.class */
public class SimpleCocoonCrawlerImpl extends AbstractLogEnabled implements CocoonCrawler, Configurable, Disposable, Recyclable {
    /** Name of the configuration element that overrides the link content type. */
    public static final String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
    /** Name of the configuration element that overrides the link view query. */
    public static final String LINK_VIEW_QUERY_CONFIG = "link-view-query";
    /** Link view query used when the configuration does not supply one. */
    public static final String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
    /** Name of the configuration element(s) listing regular expressions of URLs to exclude. */
    public static final String EXCLUDE_CONFIG = "exclude";
    /** Name of the configuration element(s) listing regular expressions of URLs to include. */
    public static final String INCLUDE_CONFIG = "include";
    /** Name of the configuration element that overrides the user-agent value. */
    public static final String USER_AGENT_CONFIG = "user-agent";
    /** User-agent value used when the configuration does not supply one. */
    public static final String USER_AGENT_DEFAULT = "Apache Cocoon 2.1-dev";
    /** Name of the configuration element that overrides the accept value. */
    public static final String ACCEPT_CONFIG = "accept";
    /** Accept value used when the configuration does not supply one. */
    public static final String ACCEPT_DEFAULT = "*/*";
    // Replaced with a fresh set by crawl() and dropped by recycle()/dispose().
    // Presumably tracks URLs that were already visited; its readers are not
    // visible in this listing -- TODO confirm against getLinks().
    private HashSet crawled;
    // Work list of URLs still to be handed out by the crawler iterator.
    private HashSet urlsToProcess;
    // NOTE(review): unlike the other *_DEFAULT constants this one is an instance
    // field (no 'static'), so every crawler instance carries its own copy.
    public final String LINK_CONTENT_TYPE_DEFAULT = Constants.LINK_CONTENT_TYPE;
    // Effective settings; each starts at its default and may be overridden in configure().
    private String linkViewQuery = "cocoon-view=links";
    private String linkContentType = Constants.LINK_CONTENT_TYPE;
    private String userAgent = "Apache Cocoon 2.1-dev";
    private String accept = "*/*";
    // Compiled include patterns (RE instances); null means "no include filter",
    // in which case isIncludedURL() accepts every URL.
    private HashSet includeCrawlingURL = null;
    // Compiled exclude patterns (RE instances); null means "no exclude filter",
    // in which case isExcludedURL() rejects nothing.
    private HashSet excludeCrawlingURL = null;

    /* loaded from: input_file:org/apache/cocoon/components/crawler/SimpleCocoonCrawlerImpl$CocoonCrawlerIterator.class */
    /**
     * Iterator over the URLs discovered by a {@link SimpleCocoonCrawlerImpl}.
     *
     * <p>The iterator does not own any state of its own: it drains the
     * crawler's work list ({@code urlsToProcess}) and, for every URL it hands
     * out, asks the crawler for the links of that URL and appends them to the
     * same work list. Iteration therefore ends when the work list runs dry.</p>
     *
     * <p>If the crawler currently has no work list (the iterator is used
     * before {@code crawl(URL)} was called, or after {@code recycle()} /
     * {@code dispose()}), the iterator behaves as an empty iterator instead of
     * failing with a {@link NullPointerException}.</p>
     */
    public static class CocoonCrawlerIterator implements Iterator {
        /** Crawler whose work list is being drained. */
        private final SimpleCocoonCrawlerImpl cocoonCrawler;

        /**
         * @param simpleCocoonCrawlerImpl the crawler providing the work list
         *        and the link extraction
         */
        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl simpleCocoonCrawlerImpl) {
            this.cocoonCrawler = simpleCocoonCrawlerImpl;
        }

        /**
         * @return {@code true} if the crawler has at least one URL left to
         *         process; {@code false} if the work list is empty or absent
         */
        @Override // java.util.Iterator
        public boolean hasNext() {
            HashSet pending = this.cocoonCrawler.urlsToProcess;
            return pending != null && !pending.isEmpty();
        }

        /**
         * Takes one URL off the work list, queues the links found for it and
         * returns it.
         *
         * @return the next {@link URL}, or {@code null} when nothing is left
         *         to process (kept for compatibility with existing callers;
         *         use {@link #hasNext()} to detect the end of the crawl)
         */
        @Override // java.util.Iterator
        public Object next() {
            HashSet pending = this.cocoonCrawler.urlsToProcess;
            if (pending == null) {
                return null;
            }
            Iterator it = pending.iterator();
            if (!it.hasNext()) {
                return null;
            }
            URL url = (URL) it.next();
            // Remove through the iterator: drops exactly the element just
            // fetched without a second hash lookup.
            it.remove();
            List links = this.cocoonCrawler.getLinks(url);
            if (links != null) {
                this.cocoonCrawler.urlsToProcess.addAll(links);
            }
            return url;
        }

        /**
         * Not supported; URLs are removed implicitly by {@link #next()}.
         *
         * @throws UnsupportedOperationException always
         */
        @Override // java.util.Iterator
        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }

    /**
     * Reads the crawler settings from the component configuration.
     *
     * <ul>
     *   <li>{@code include} elements: comma/space separated regular
     *       expressions of URLs to crawl. When absent, the include filter is
     *       left untouched (all URLs are included by default).</li>
     *   <li>{@code exclude} elements: comma/space separated regular
     *       expressions of URLs to skip. When absent, the built-in default
     *       exclude patterns are installed.</li>
     *   <li>{@code link-content-type}, {@code link-view-query}: optional
     *       overrides, trimmed; blank values keep the current setting.</li>
     *   <li>{@code user-agent}, {@code accept}: optional overrides, taken
     *       verbatim; empty values keep the current setting.</li>
     * </ul>
     *
     * <p>A pattern that fails to compile is logged as an error; the remaining
     * tokens of the same element are skipped, as before.</p>
     *
     * @param configuration the configuration of this component
     * @throws ConfigurationException propagated from the underlying
     *         {@link Configuration} accessors
     */
    public void configure(Configuration configuration) throws ConfigurationException {
        Configuration[] includes = configuration.getChildren(INCLUDE_CONFIG);
        if (includes.length > 0) {
            this.includeCrawlingURL = new HashSet();
            addConfiguredPatterns(includes, this.includeCrawlingURL, "including");
        } else if (getLogger().isDebugEnabled()) {
            getLogger().debug("Include all URLs");
        }

        Configuration[] excludes = configuration.getChildren(EXCLUDE_CONFIG);
        this.excludeCrawlingURL = new HashSet();
        if (excludes.length > 0) {
            addConfiguredPatterns(excludes, this.excludeCrawlingURL, "excluding");
        } else {
            setDefaultExcludeFromCrawling();
            if (getLogger().isDebugEnabled()) {
                getLogger().debug("Exclude default URLs only");
            }
        }

        // Trim BEFORE testing for emptiness so that a whitespace-only value
        // cannot replace the default with an empty string.
        String contentType = readOptionalValue(configuration, LINK_CONTENT_TYPE_CONFIG, true);
        if (contentType != null) {
            this.linkContentType = contentType;
        }
        String viewQuery = readOptionalValue(configuration, LINK_VIEW_QUERY_CONFIG, true);
        if (viewQuery != null) {
            this.linkViewQuery = viewQuery;
        }
        String agent = readOptionalValue(configuration, USER_AGENT_CONFIG, false);
        if (agent != null) {
            this.userAgent = agent;
        }
        String acceptValue = readOptionalValue(configuration, ACCEPT_CONFIG, false);
        if (acceptValue != null) {
            this.accept = acceptValue;
        }
    }

    /**
     * Compiles the ", "-delimited regular expressions of each element and adds them to {@code target}.
     *
     * @param elements the {@code include} or {@code exclude} elements
     * @param target   set receiving the compiled {@link RE} instances
     * @param kind     word used in the error message ("including" / "excluding")
     * @throws ConfigurationException propagated from {@link Configuration#getValue()}
     */
    private void addConfiguredPatterns(Configuration[] elements, HashSet target, String kind)
            throws ConfigurationException {
        for (Configuration element : elements) {
            String patternList = element.getValue();
            try {
                Tokenizer tokens = new Tokenizer(patternList, ", ");
                while (tokens.hasMoreTokens()) {
                    target.add(new RE(tokens.nextToken()));
                }
            } catch (RESyntaxException e) {
                getLogger().error("Cannot create " + kind + " regular-expression for " + patternList, e);
            }
        }
    }

    /**
     * Returns the value of the optional child {@code name}, or {@code null} when it is absent or empty.
     *
     * @param parent configuration to look the child up in
     * @param name   name of the child element
     * @param trim   whether to trim the value before the emptiness check
     * @return the (optionally trimmed) non-empty value, or {@code null}
     * @throws ConfigurationException propagated from {@link Configuration#getValue()}
     */
    private static String readOptionalValue(Configuration parent, String name, boolean trim)
            throws ConfigurationException {
        Configuration child = parent.getChild(name, false);
        if (child == null) {
            return null;
        }
        String value = child.getValue();
        if (value != null && trim) {
            value = value.trim();
        }
        return (value == null || value.length() == 0) ? null : value;
    }

    /**
     * Releases everything this component holds: the compiled include and
     * exclude patterns as well as the state of the current crawl.
     */
    public void dispose() {
        // configuration-derived state
        this.includeCrawlingURL = null;
        this.excludeCrawlingURL = null;
        // per-crawl state
        this.urlsToProcess = null;
        this.crawled = null;
    }

    /**
     * Drops the state of the current crawl so the instance can be reused.
     * The configured include/exclude patterns are left in place.
     */
    public void recycle() {
        this.urlsToProcess = null;
        this.crawled = null;
    }

    /**
     * Starts a new crawl: discards any previous crawl state and seeds the
     * work list with the given URL.
     *
     * @param url the URL the crawl starts from
     */
    @Override // org.apache.cocoon.components.crawler.CocoonCrawler
    public void crawl(URL url) {
        final HashSet pending = new HashSet();
        this.crawled = new HashSet();
        this.urlsToProcess = pending;

        if (getLogger().isDebugEnabled()) {
            getLogger().debug("crawl URL " + url);
        }

        pending.add(url);
    }

    /**
     * Creates an iterator that drains this crawler's work list.
     *
     * @return a new {@link CocoonCrawlerIterator} bound to this crawler
     */
    @Override // org.apache.cocoon.components.crawler.CocoonCrawler
    public Iterator iterator() {
        final Iterator pendingUrls = new CocoonCrawlerIterator(this);
        return pendingUrls;
    }

    /**
     * Adds the built-in exclude patterns (gif, png, jpg/jpeg, js and css
     * resources, each optionally followed by a query string) to the exclude
     * set. A pattern that fails to compile is logged and skipped.
     */
    private void setDefaultExcludeFromCrawling() {
        final String[] defaultPatterns = {
            ".*\\.gif(\\?.*)?$",
            ".*\\.png(\\?.*)?$",
            ".*\\.jpe?g(\\?.*)?$",
            ".*\\.js(\\?.*)?$",
            ".*\\.css(\\?.*)?$"
        };
        for (int i = 0; i < defaultPatterns.length; i++) {
            final String pattern = defaultPatterns[i];
            try {
                this.excludeCrawlingURL.add(new RE(pattern));
            } catch (RESyntaxException syntaxError) {
                getLogger().error("Cannot create excluding regular-expression for " + pattern, syntaxError);
            }
        }
    }

    /*  JADX ERROR: JadxRuntimeException in pass: BlockProcessor
        jadx.core.utils.exceptions.JadxRuntimeException: Unreachable block: B:26:0x0249
        	at jadx.core.dex.visitors.blocks.BlockProcessor.checkForUnreachableBlocks(BlockProcessor.java:88)
        	at jadx.core.dex.visitors.blocks.BlockProcessor.processBlocksTree(BlockProcessor.java:52)
        	at jadx.core.dex.visitors.blocks.BlockProcessor.visit(BlockProcessor.java:44)
        */
    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Placeholder for the link-extraction routine.
     *
     * <p>The decompiler could not reconstruct this method (see the JADX error
     * above), so the body below is NOT the real implementation: it throws
     * unconditionally. {@code CocoonCrawlerIterator.next()} calls this method
     * for every URL it hands out and treats a {@code null} result as "no
     * links", so with this stub any call to {@code next()} on a non-empty work
     * list fails.</p>
     *
     * <p>NOTE(review): presumably the original fetches the link view of the
     * URL and filters the result through isIncludedURL()/isExcludedURL();
     * that cannot be confirmed from this listing.</p>
     *
     * @param r8 the URL whose links are requested
     * @return never returns normally in this decompiled form
     * @throws UnsupportedOperationException always, in this decompiled form
     */
    public java.util.List getLinks(java.net.URL r8) {
        /*
            Method dump skipped, instructions count: 591
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: org.apache.cocoon.components.crawler.SimpleCocoonCrawlerImpl.getLinks(java.net.URL):java.util.List");
    }

    /**
     * Tells whether a URL matches one of the exclude patterns.
     *
     * @param str the URL in string form
     * @return {@code true} if at least one exclude pattern matches;
     *         {@code false} if none matches or no exclude set is installed
     */
    private boolean isExcludedURL(String str) {
        final HashSet patterns = this.excludeCrawlingURL;
        if (patterns == null) {
            // no exclude filter installed: nothing is excluded
            return false;
        }

        // Kept from the original: a null argument fails here.
        final String candidate = str.toString();
        boolean excluded = false;
        for (Iterator it = patterns.iterator(); !excluded && it.hasNext();) {
            excluded = ((RE) it.next()).match(candidate);
        }

        if (getLogger().isDebugEnabled()) {
            getLogger().debug((excluded ? "Excluded URL " : "Not excluded URL ") + str);
        }
        return excluded;
    }

    /**
     * Tells whether a URL passes the include filter.
     *
     * @param str the URL in string form
     * @return {@code true} if no include set is installed or at least one
     *         include pattern matches; {@code false} otherwise
     */
    private boolean isIncludedURL(String str) {
        final HashSet patterns = this.includeCrawlingURL;
        if (patterns == null) {
            // no include filter installed: everything is included
            return true;
        }

        // Kept from the original: a null argument fails here.
        final String candidate = str.toString();
        boolean included = false;
        for (Iterator it = patterns.iterator(); !included && it.hasNext();) {
            included = ((RE) it.next()).match(candidate);
        }

        if (getLogger().isDebugEnabled()) {
            getLogger().debug((included ? "Included URL " : "Not included URL ") + str);
        }
        return included;
    }
}
