/*
 * Decompiled with CFR 0.152.
 */
package org.apache.nutch.protocol;

import crawlercommons.robots.BaseRobotRules;
import crawlercommons.robots.SimpleRobotRules;
import crawlercommons.robots.SimpleRobotRulesParser;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.LineNumberReader;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Collection;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolNotFound;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.SuffixStringMatcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Base class for robots.txt handling: reads the agent names, the redirect
 * limit and the host allowlist from the configuration, delegates parsing of
 * robots.txt content to a shared {@link SimpleRobotRulesParser}, and provides
 * a command-line tool (see {@link #run(String[])}) to test URLs against a
 * robots.txt file.
 *
 * <p>Subclasses implement
 * {@link #getRobotRulesSet(Protocol, URL, List)} to obtain the rules for a
 * given URL.</p>
 */
public abstract class RobotRulesParser
implements Tool {
    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

    /**
     * Shared rule cache keyed by a string. It is not read or written in this
     * class; presumably subclasses populate it (confirm against the
     * protocol-specific implementations).
     */
    protected static final Hashtable<String, BaseRobotRules> CACHE = new Hashtable<>();

    /** Rule set created in {@code ALLOW_ALL} mode. */
    public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_ALL);

    /**
     * Rule set created in {@code ALLOW_NONE} mode. Left non-final to keep the
     * existing public interface unchanged.
     */
    public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);

    /**
     * Rule set created in {@code ALLOW_NONE} mode with the "defer visits" flag
     * switched on in the static initializer below.
     */
    public static final BaseRobotRules DEFER_VISIT_RULES = new SimpleRobotRules(SimpleRobotRules.RobotRulesMode.ALLOW_NONE);

    /** Parser instance shared by all {@code RobotRulesParser} objects. */
    private static SimpleRobotRulesParser robotParser;

    static {
        DEFER_VISIT_RULES.setDeferVisits(true);
        robotParser = new SimpleRobotRulesParser();
        // NOTE(review): presumably lifts the parser's crawl-delay cap so that
        // large Crawl-delay values are kept -- confirm against crawler-commons.
        robotParser.setMaxCrawlDelay(Long.MAX_VALUE);
    }

    /** Configuration most recently passed to {@link #setConf(Configuration)}. */
    protected Configuration conf;

    /** Lower-cased agent names (insertion-ordered) used to select robots.txt rules. */
    protected Set<String> agentNames;

    /** Value of {@code http.robots.redirect.max} (default 5). */
    protected int maxNumRedirects = 5;

    /** Non-empty entries of {@code http.robot.rules.allowlist}. */
    protected Set<String> allowList = new HashSet<String>();

    /** Suffix matcher built from {@link #allowList}; {@code null} if the list is empty. */
    private SuffixStringMatcher matcher = null;

    /** Creates an unconfigured instance; {@link #setConf(Configuration)} must be called before use. */
    public RobotRulesParser() {
    }

    /**
     * Creates an instance and configures it immediately.
     *
     * @param conf configuration handed to {@link #setConf(Configuration)}
     */
    public RobotRulesParser(Configuration conf) {
        this.setConf(conf);
    }

    /**
     * Stores the configuration and (re)initializes agent names, the redirect
     * limit and the host allowlist from it.
     *
     * <p>The wildcard agent {@code *} is never added to {@link #agentNames};
     * an empty set therefore stands for "any agent" (see the message printed
     * by {@link #run(String[])}).</p>
     *
     * @param conf configuration to read from
     * @throws RuntimeException if {@code http.agent.name} is missing or blank
     */
    public void setConf(Configuration conf) {
        this.conf = conf;

        String agentName = conf.get("http.agent.name");
        if (agentName != null) {
            agentName = agentName.trim();
        }
        if (agentName == null || agentName.isEmpty()) {
            throw new RuntimeException("Agent name not configured!");
        }

        this.agentNames = new LinkedHashSet<String>();
        if (!"*".equals(agentName)) {
            // Locale.ROOT keeps lower-casing independent of the JVM default locale.
            this.agentNames.add(agentName.toLowerCase(Locale.ROOT));
        }

        String[] otherAgents = conf.getStrings("http.robots.agents");
        if (otherAgents != null) {
            for (String otherAgent : otherAgents) {
                String lowered = otherAgent.toLowerCase(Locale.ROOT);
                // Skip the wildcard and the primary agent name (already handled above).
                if (lowered.equals("*") || lowered.equalsIgnoreCase(agentName)) {
                    continue;
                }
                this.agentNames.add(lowered);
            }
        }
        LOG.info("Checking robots.txt for the following agent names: {}", this.agentNames);

        this.maxNumRedirects = conf.getInt("http.robots.redirect.max", 5);
        LOG.info("Following max. {} robots.txt redirects", this.maxNumRedirects);

        // Start from a clean slate so that a repeated setConf() call does not
        // keep allowlist entries (or the matcher) of a previous configuration.
        this.allowList = new HashSet<String>();
        this.matcher = null;

        String[] confAllowList = conf.getStrings("http.robot.rules.allowlist");
        if (confAllowList == null) {
            LOG.debug("robots.txt allowlist not configured.");
            return;
        }
        for (String entry : confAllowList) {
            if (entry.isEmpty()) {
                LOG.info("Empty allowlisted URL skipped!");
                continue;
            }
            this.allowList.add(entry);
        }
        if (!this.allowList.isEmpty()) {
            this.matcher = new SuffixStringMatcher(this.allowList);
            LOG.info("Allowlisted hosts: {}", this.allowList);
        }
    }

    /**
     * @return the configuration last passed to {@link #setConf(Configuration)}
     */
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Checks whether the host of the given URL is matched by the configured
     * allowlist.
     *
     * @param url URL whose host is tested
     * @return {@code true} if an allowlist matcher exists and matches the host,
     *         {@code false} otherwise
     */
    public boolean isAllowListed(URL url) {
        String host = url.getHost();
        return this.matcher != null && this.matcher.matches(host);
    }

    /**
     * Parses robots.txt content for a single robot name.
     *
     * @param url         URL passed through to the parser
     * @param content     raw robots.txt bytes
     * @param contentType content type passed through to the parser
     * @param robotName   robot name passed through to the parser
     * @return the rules returned by the shared parser
     * @deprecated use {@link #parseRules(String, byte[], String, Collection)}
     */
    @Deprecated
    public BaseRobotRules parseRules(String url, byte[] content, String contentType, String robotName) {
        return robotParser.parseContent(url, content, contentType, robotName);
    }

    /**
     * Parses robots.txt content for a collection of robot names.
     *
     * @param url         URL passed through to the parser
     * @param content     raw robots.txt bytes
     * @param contentType content type passed through to the parser
     * @param robotNames  robot names passed through to the parser
     * @return the rules returned by the shared parser
     */
    public BaseRobotRules parseRules(String url, byte[] content, String contentType, Collection<String> robotNames) {
        return robotParser.parseContent(url, content, contentType, robotNames);
    }

    /**
     * Convenience overload taking the URL as {@link Text}.
     *
     * @param protocol         protocol handed on to the URL-based overload
     * @param url              URL as text
     * @param robotsTxtContent list handed on to the URL-based overload
     * @return {@link #EMPTY_RULES} if the text cannot be turned into a URL,
     *         otherwise the result of
     *         {@link #getRobotRulesSet(Protocol, URL, List)}
     */
    public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url, List<Content> robotsTxtContent) {
        URL parsed;
        try {
            parsed = new URL(url.toString());
        }
        catch (Exception e) {
            // Deliberately broad: any failure to build the URL yields the allow-all rules.
            return EMPTY_RULES;
        }
        return this.getRobotRulesSet(protocol, parsed, robotsTxtContent);
    }

    /**
     * Returns the robots.txt rules applying to the given URL.
     *
     * @param var1 protocol to use (the test implementation in this file accepts {@code null})
     * @param var2 URL the rules are requested for
     * @param var3 list handed to the protocol when rules are fetched through it;
     *             {@link #run(String[])} passes {@code null} unless
     *             {@code fetcher.store.robotstxt} is set
     * @return the robots.txt rules
     */
    public abstract BaseRobotRules getRobotRulesSet(Protocol var1, URL var2, List<Content> var3);

    /**
     * Command-line entry point: loads a robots.txt (from a URL or a local
     * file) and prints for every URL of a second file whether it is
     * allowlisted, allowed or not allowed.
     *
     * @param args {@code <robots-file-or-url> <url-file> [<agent-names>]}
     * @return 0 on success, -1 on usage errors or failures
     */
    public int run(String[] args) {
        if (args.length < 2) {
            printUsage();
            return -1;
        }

        if (args.length > 2) {
            // Agent names given on the command line override the configured ones.
            String agents = args[2];
            this.conf.set("http.robots.agents", agents);
            this.conf.set("http.agent.name", agents.split(",")[0]);
            this.setConf(this.conf);
        }

        Protocol protocol = null;
        URL robotsTxtUrl = null;
        if (args[0].matches("^(?:https?|ftp|file)://?.*")) {
            try {
                robotsTxtUrl = new URL(args[0]);
            }
            catch (MalformedURLException e) {
                LOG.warn("Not a valid URL, assuming local file: {}", args[0]);
            }
            // Only look up a protocol when a URL was actually parsed; otherwise
            // fall through to the local-file handling announced in the warning.
            if (robotsTxtUrl != null) {
                ProtocolFactory factory = new ProtocolFactory(this.conf);
                try {
                    protocol = factory.getProtocol(robotsTxtUrl);
                    LOG.debug("Using protocol {} to fetch robots.txt", protocol.getClass());
                }
                catch (ProtocolNotFound e) {
                    LOG.error("No protocol found for {}: {}", args[0], StringUtils.stringifyException(e));
                    return -1;
                }
            }
        }

        if (robotsTxtUrl == null) {
            File robotsFile = new File(args[0]);
            if (!robotsFile.exists()) {
                LOG.error("File does not exist: {}", args[0]);
                return -1;
            }
            try {
                robotsTxtUrl = robotsFile.toURI().toURL();
            }
            catch (MalformedURLException e) {
                // Without a URL nothing can be loaded below, so fail explicitly.
                LOG.error("Cannot convert file path to URL: {}", args[0], e);
                return -1;
            }
        }

        File urlFile = new File(args[1]);
        List<Content> robotsTxtContent = null;
        if (this.getConf().getBoolean("fetcher.store.robotstxt", false)) {
            robotsTxtContent = new LinkedList<Content>();
        }

        try {
            BaseRobotRules rules = this.getRobotRulesSet(protocol, robotsTxtUrl, robotsTxtContent);
            LOG.debug("Robots.txt rules:\n{}", rules);
            if (robotsTxtContent != null) {
                for (Content robotsTxt : robotsTxtContent) {
                    LOG.info("fetched robots.txt {}:", robotsTxt.getUrl());
                    LOG.info(robotsTxt.toString());
                }
            }
            System.out.println("Testing robots.txt for agent names: "
                    + String.valueOf(this.agentNames.isEmpty() ? "* (any other agent)" : this.agentNames));

            // try-with-resources closes the reader even if reading fails midway.
            try (LineNumberReader testsIn = new LineNumberReader(new FileReader(urlFile))) {
                String testPath;
                while ((testPath = testsIn.readLine()) != null) {
                    testPath = testPath.trim();
                    try {
                        URL url = new URL(testPath);
                        String status;
                        if (this.isAllowListed(url)) {
                            status = "allowlisted";
                        } else if (rules.isAllowed(testPath)) {
                            status = "allowed";
                        } else {
                            status = "not allowed";
                        }
                        System.out.println(status + ":\t" + testPath);
                    }
                    catch (MalformedURLException e) {
                        LOG.warn("Not a valid URL: {}", testPath);
                    }
                }
            }
        }
        catch (IOException e) {
            LOG.error("Failed to run:", e);
            return -1;
        }
        return 0;
    }

    /** Prints the command-line usage of the tool to standard error. */
    private static void printUsage() {
        String[] help = new String[]{
            "Usage: RobotRulesParser [ -Dproperty=... ] <robots-file-or-url> <url-file> [<agent-names>]",
            "",
            "<robots-file-or-url>\tlocal file or URL parsed as robots.txt file",
            "\tIf <robots-file-or-url> starts with a protocol specification",
            "\t(`http', `https', `ftp' or `file'), the URL is parsed, URL path",
            "\tand query are removed and the path \"/robots.txt\" is appended.",
            "\tThe resulting URL (the canonical robots.txt location) is then",
            "\tfetched using the specified protocol.",
            "\tIf the URL does not include a protocol, a local file is assumed.",
            "",
            "<url-file>\tlocal file with URLs (one per line), for every URL",
            "\tthe path part (including the query) is checked whether",
            "\tit is allowed by the robots.txt rules.  Other parts of the URLs",
            "\t(mainly the host) are ignored.",
            "",
            "<agent-names>\tuser-agent name (aka. \"product token\")",
            "\tused to select rules from the robots.txt file.",
            "\tMultiple agent names can be passed as comma-separated string.",
            "\tIf no agent name is given the properties http.agent.name",
            "\tand http.robots.agents are used.",
            "\tIf also http.agent.name and http.robots.agents are empty,",
            "\trobots.txt is checked for rules assigned to the user",
            "\tagent `*' (meaning any other).",
            "",
            "Important properties:",
            " -D fetcher.store.robotstxt=true",
            "\toutput content and HTTP meta data of fetched robots.txt (if not a local file)",
            " -D http.agent.name=...\t(primary) agent name",
            " -D http.robots.agents=...\tadditional agent names",
            " -D http.robot.rules.allowlist=..."
        };
        for (String line : help) {
            System.err.println(line);
        }
    }

    /**
     * Runs the robots.txt test tool through {@link ToolRunner} and exits the
     * JVM with the tool's return code.
     *
     * @param args command-line arguments, see {@link #run(String[])}
     * @throws Exception if {@link ToolRunner} fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        int res = ToolRunner.run(conf, new TestRobotRulesParser(), args);
        System.exit(res);
    }

    /**
     * Concrete parser backing the command-line tool. It tolerates a missing
     * {@code http.agent.name} and can read robots.txt directly from a URL
     * when no protocol is given.
     */
    private static class TestRobotRulesParser
    extends RobotRulesParser {
        private TestRobotRulesParser() {
        }

        /**
         * Fills in {@code http.agent.name} when it is empty -- with the first
         * entry of {@code http.robots.agents}, or {@code *} if that is empty
         * too -- and then delegates to the parent implementation.
         */
        @Override
        public void setConf(Configuration conf) {
            if (conf.get("http.agent.name", "").isEmpty()) {
                String firstRobotsAgent = conf.get("http.robots.agents", "").split(",")[0].trim();
                conf.set("http.agent.name", firstRobotsAgent.isEmpty() ? "*" : firstRobotsAgent);
            }
            super.setConf(conf);
        }

        /**
         * Obtains the rules through the protocol if one is given; otherwise
         * opens the URL directly, reads it completely and parses the bytes as
         * {@code text/plain} for the configured agent names.
         *
         * @return the parsed rules, or {@link RobotRulesParser#EMPTY_RULES} if
         *         the URL cannot be read
         */
        @Override
        public BaseRobotRules getRobotRulesSet(Protocol protocol, URL url, List<Content> robotsTxtContent) {
            if (protocol != null) {
                return protocol.getRobotRules(new Text(url.toString()), null, robotsTxtContent);
            }
            // Read until EOF instead of trusting the reported content length:
            // the length may be unknown (-1) and a single read() call is not
            // guaranteed to fill the buffer.
            try (InputStream in = url.openStream()) {
                byte[] robotsBytes = readFully(in);
                return robotParser.parseContent(url.toString(), robotsBytes, "text/plain", this.agentNames);
            }
            catch (IOException e) {
                LOG.error("Failed to open robots.txt file {}:", url, e);
                return EMPTY_RULES;
            }
        }

        /** Reads the stream until end-of-file and returns all bytes read. */
        private static byte[] readFully(InputStream in) throws IOException {
            ByteArrayOutputStream buffer = new ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, count);
            }
            return buffer.toByteArray();
        }
    }
}

