package org.lockss.crawler;

import java.io.File;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Hashtable;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import org.lockss.config.ConfigManager;
import org.lockss.config.Configuration;
import org.lockss.daemon.ArchiveEntry;
import org.lockss.daemon.ConfigParamDescr;
import org.lockss.daemon.CrawlRule;
import org.lockss.daemon.LockssWatchdog;
import org.lockss.daemon.PluginException;
import org.lockss.extractor.ArticleMetadata;
import org.lockss.extractor.ArticleMetadataExtractor;
import org.lockss.extractor.ArticleMetadataExtractorFactory;
import org.lockss.extractor.ArticleMetadataListExtractor;
import org.lockss.extractor.BaseArticleMetadataExtractor;
import org.lockss.extractor.MetadataField;
import org.lockss.extractor.MetadataTarget;
import org.lockss.plugin.ArchivalUnit;
import org.lockss.plugin.ArticleFiles;
import org.lockss.plugin.ArticleIteratorFactory;
import org.lockss.plugin.AuUtil;
import org.lockss.plugin.CachedUrl;
import org.lockss.plugin.CachedUrlSet;
import org.lockss.plugin.ExploderHelper;
import org.lockss.plugin.Plugin;
import org.lockss.plugin.PluginManager;
import org.lockss.plugin.PluginTestUtil;
import org.lockss.plugin.SubTreeArticleIterator;
import org.lockss.plugin.exploded.ExplodedArchivalUnit;
import org.lockss.plugin.exploded.ExplodingUrlConsumerFactory;
import org.lockss.plugin.simulated.SimulatedArchivalUnit;
import org.lockss.plugin.simulated.SimulatedContentGenerator;
import org.lockss.plugin.simulated.SimulatedPlugin;
import org.lockss.test.ConfigurationUtil;
import org.lockss.test.LockssTestCase;
import org.lockss.test.MockLockssDaemon;
import org.lockss.test.NonVersionCheckingPluginManager;
import org.lockss.util.CIProperties;
import org.lockss.util.HeaderUtil;
import org.lockss.util.Logger;
import org.lockss.util.MetadataUtil;

/* loaded from: input_file:org/lockss/crawler/FuncZipExploder2.class */
public class FuncZipExploder2 extends LockssTestCase {
    /** Simulated AU created in {@link #setUp(int)}; it is the AU that gets crawled. */
    private SimulatedArchivalUnit sau;
    /** Mock daemon obtained in {@link #setUp(int)} and stopped in {@link #tearDown()}. */
    private MockLockssDaemon theDaemon;
    /** Plugin manager installed into the mock daemon by {@link #setUp(int)}. */
    PluginManager pluginMgr;
    /** Crawl manager installed into the mock daemon and handed to the crawler. */
    private CrawlManagerImpl crawlMgr;
    /**
     * URLs that must end up in the simulated AU itself; also the whitelist used by
     * {@code MySimulatedArchivalUnit.shouldBeCached}. Assigned in the static initializer.
     */
    static String[] url2;
    /** Year expected in the configuration of every AU that holds an exploded URL. */
    static final String GOOD_YEAR = "2005";
    private static final int DEFAULT_MAX_DEPTH = 1000;
    private static final int DEFAULT_FILESIZE = 3000;
    /** Value passed as the simulated content's {@code binFileSize}. */
    private static int fileSize;
    /** Crawl depth limit; set by {@link #setUp(int)} and optionally by {@code main}. */
    private static int maxDepth;
    static Logger log = Logger.getLogger();
    // NOTE: must be declared before `url`, whose initializer concatenates it.
    private static String URL_PREFIX = "http://springer.clockss.org/JOU=00109/VOL=83/ISU=12";
    /** URLs that {@code checkExplodedUrls} expects to find, with content, in an exploded AU. */
    static String[] url = {URL_PREFIX + "/ART=2005_719/109_2005_Article_719.xml.meta", URL_PREFIX + "/ART=2005_719/BodyRef/PDF/109_2005_Article_719.pdf", URL_PREFIX + "/ART=2005_721/109_2005_Article_721.xml.meta", URL_PREFIX + "/ART=2005_721/BodyRef/PDF/109_2005_Article_721.pdf", URL_PREFIX + "/ART=2005_724/109_2005_Article_724.xml.meta", URL_PREFIX + "/ART=2005_724/BodyRef/PDF/109_2005_Article_724.pdf"};
    /** DOIs that {@code checkDOIs} expects to extract; populated in the static initializer. */
    private static final Set<String> doiSet = new HashSet();

    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MyArticleIteratorFactory.class */
    /**
     * Article iterator factory used by the test: every cached URL visited becomes an
     * {@link ArticleFiles} whose full-text CU is that URL. When the URL looks like
     * {@code .../BodyRef/PDF/....pdf}, the sibling {@code .xml.Meta} / {@code .xml.meta}
     * file is looked up and, if it has content of MIME type {@code text/xml}, attached
     * under the {@code "xml"} role.
     */
    public static class MyArticleIteratorFactory implements ArticleIteratorFactory {
        Pattern pat = null;
        private static final String part1 = "/BodyRef/PDF";
        private static final String part2 = "\\.pdf";
        private static final String regex = ".*/BodyRef/PDF/.*\\.pdf";

        /**
         * Builds a {@link SubTreeArticleIterator} over the AU whose
         * {@code createArticleFiles} applies the PDF-to-metadata mapping described on the class.
         *
         * @param archivalUnit the AU to iterate over
         * @param metadataTarget target passed to the iterator's {@code Spec}
         * @return an iterator of article file groups
         * @throws PluginException declared by the interface; not thrown by this code directly
         */
        public Iterator<ArticleFiles> createArticleIterator(ArchivalUnit archivalUnit, MetadataTarget metadataTarget) throws PluginException {
            return new SubTreeArticleIterator(archivalUnit, new SubTreeArticleIterator.Spec().setTarget(metadataTarget)) {
                protected ArticleFiles createArticleFiles(CachedUrl cachedUrl) {
                    ArticleFiles articleFiles = new ArticleFiles();
                    articleFiles.setFullTextCu(cachedUrl);
                    String pdfUrl = cachedUrl.getUrl();
                    // Names are qualified on purpose so they cannot be shadowed by
                    // members inherited from SubTreeArticleIterator.
                    if (pdfUrl.matches(MyArticleIteratorFactory.regex)) {
                        MyArticleIteratorFactory.attachXmlRole(articleFiles, cachedUrl, pdfUrl);
                    } else {
                        FuncZipExploder2.log.debug2(pdfUrl + " doesn't match " + MyArticleIteratorFactory.regex);
                    }
                    if (FuncZipExploder2.log.isDebug3()) {
                        FuncZipExploder2.log.debug3("Iter: " + articleFiles);
                    }
                    return articleFiles;
                }
            };
        }

        /**
         * Locates the metadata file that belongs to {@code pdfUrl} and, when it has
         * {@code text/xml} content, registers it on {@code articleFiles} under the
         * {@code "xml"} role. Tries the {@code .xml.Meta} spelling first and falls back
         * to {@code .xml.meta}.
         */
        private static void attachXmlRole(ArticleFiles articleFiles, CachedUrl pdfCu, String pdfUrl) {
            ArchivalUnit au = pdfCu.getArchivalUnit();
            // Drop the "/BodyRef/PDF" path segment; the metadata file sits one level above it.
            String withoutPdfDir = pdfUrl.replaceFirst(part1, TestBaseCrawler.EMPTY_PAGE);

            CachedUrl xmlCu = au.makeCachedUrl(withoutPdfDir.replaceFirst(part2, ".xml.Meta"));
            if (xmlCu == null || !xmlCu.hasContent()) {
                logMissing(xmlCu);
                if (xmlCu != null) {
                    // Release the first probe before replacing it so it is not leaked.
                    AuUtil.safeRelease(xmlCu);
                }
                xmlCu = au.makeCachedUrl(withoutPdfDir.replaceFirst(part2, ".xml.meta"));
            }
            if (xmlCu == null) {
                logMissing(null);
                return;
            }
            try {
                if (!xmlCu.hasContent()) {
                    // Only report "no content" when that is actually the case.
                    logMissing(xmlCu);
                    return;
                }
                String mimeType = HeaderUtil.getMimeTypeFromContentType(xmlCu.getContentType());
                if ("text/xml".equalsIgnoreCase(mimeType)) {
                    articleFiles.setRoleCu("xml", xmlCu);
                } else {
                    FuncZipExploder2.log.debug2("xml.meta wrong mime type: " + mimeType + ": " + xmlCu.getUrl());
                }
            } finally {
                AuUtil.safeRelease(xmlCu);
            }
        }

        /** Logs why a metadata CU could not be used: it is null, or it has no content. */
        private static void logMissing(CachedUrl xmlCu) {
            if (xmlCu == null) {
                FuncZipExploder2.log.debug2("xmlCu is null");
            } else {
                FuncZipExploder2.log.debug2(xmlCu.getUrl() + " no content");
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MyArticleMetadataExtractorFactory.class */
    /**
     * Metadata extractor factory used by the test; it always hands out a
     * {@link BaseArticleMetadataExtractor} constructed with the {@code "xml"} role.
     */
    public static class MyArticleMetadataExtractorFactory implements ArticleMetadataExtractorFactory {
        /** Role name passed to the extractor; matches the role set by the article iterator. */
        private static final String METADATA_ROLE = "xml";

        /** Package-private constructor; nothing to initialise. */
        MyArticleMetadataExtractorFactory() {
        }

        /**
         * @param metadataTarget ignored by this implementation
         * @return a new extractor bound to the {@code "xml"} role
         * @throws PluginException declared by the interface; not thrown here
         */
        public ArticleMetadataExtractor createArticleMetadataExtractor(MetadataTarget metadataTarget) throws PluginException {
            ArticleMetadataExtractor extractor = new BaseArticleMetadataExtractor(METADATA_ROLE);
            return extractor;
        }
    }

    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MyCrawlRule.class */
    /**
     * Crawl rule that distinguishes URLs under {@code http://www.example.com}
     * from everything else.
     */
    public static class MyCrawlRule implements CrawlRule {
        /**
         * @param str the URL to classify
         * @return {@code 1} when the URL starts with {@code http://www.example.com},
         *     otherwise {@code 2}
         */
        public int match(String str) {
            // NOTE(review): 1 and 2 are presumably CrawlRule.INCLUDE and
            // CrawlRule.EXCLUDE inlined by the decompiler — confirm before replacing.
            if (str.startsWith("http://www.example.com")) {
                return 1;
            }
            return 2;
        }
    }

    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MyExploderHelper.class */
    /**
     * Exploder helper that maps archive entry names of the form
     * {@code [PUB=.../]JOU=.../VOL=.../ISU=.../ART=.../...} onto URLs below
     * {@code http://springer.clockss.org/}, and fills in the entry's base URL,
     * rest-of-URL, synthetic headers, table-of-contents links (for PDFs) and AU properties.
     */
    public static class MyExploderHelper implements ExploderHelper {
        private static final String BASE_URL_STEM = "http://springer.clockss.org/";
        static final String[] tags = {"JOU=", "VOL=", "ISU=", "ART="};
        private static final String PUB_FLAG = "PUB=";
        private static final String PUB_NAME = "Springer";
        private static final int JOU_INDEX = 0;
        private static final int VOL_INDEX = 1;
        private static final int ISU_INDEX = 2;
        private static final int ART_INDEX = 3;
        static final int endOfBase = 0;
        static final int minimumPathLength = 4;
        /** Length of every entry in {@link #tags} ("JOU=", "VOL=", ...). */
        private static final int TAG_LENGTH = 4;
        /** Number of characters taken from the article element, after its tag, as the year. */
        private static final int YEAR_LENGTH = 4;

        /**
         * Validates the entry's path and populates the entry. Malformed paths are
         * reported with a warning and the entry is left untouched.
         *
         * @param archiveEntry the archive entry to map; modified in place
         */
        public void process(ArchiveEntry archiveEntry) {
            String entryName = archiveEntry.getName();
            if (entryName.startsWith(PUB_FLAG)) {
                // Strip the leading "PUB=.../" element.
                int firstSlash = entryName.indexOf("/");
                if (firstSlash <= 0) {
                    FuncZipExploder2.log.warning("Path " + entryName + " malformaeed");
                    return;
                }
                entryName = entryName.substring(firstSlash + 1);
            }
            String[] pathElements = entryName.split("/");
            if (pathElements.length < minimumPathLength) {
                FuncZipExploder2.log.warning("Path " + entryName + " too short");
                return;
            }
            for (int i = 0; i < pathElements.length; i++) {
                FuncZipExploder2.log.debug3("pathElements[" + i + "] = " + pathElements[i]);
            }

            // Elements up to and including endOfBase form the AU base URL.
            StringBuilder baseBuilder = new StringBuilder(BASE_URL_STEM);
            for (int i = 0; i <= endOfBase; i++) {
                if (!hasExpectedTag(pathElements, i, entryName)) {
                    return;
                }
                baseBuilder.append(pathElements[i]).append("/");
            }
            // The remaining elements form the rest of the URL; only the first
            // ART_INDEX + 1 elements carry a tag that can be checked.
            StringBuilder restBuilder = new StringBuilder(TestBaseCrawler.EMPTY_PAGE);
            for (int i = endOfBase + 1; i < pathElements.length; i++) {
                if (i <= ART_INDEX && !hasExpectedTag(pathElements, i, entryName)) {
                    return;
                }
                restBuilder.append(pathElements[i]);
                if (i + 1 < pathElements.length) {
                    restBuilder.append("/");
                }
            }
            // The year is cut out of the article element below; reject entries that
            // are too short for that before touching the archive entry at all.
            if (pathElements[ART_INDEX].length() < TAG_LENGTH + YEAR_LENGTH) {
                FuncZipExploder2.log.warning("Element " + ART_INDEX + " of " + entryName + " too short to contain a year");
                return;
            }

            String baseUrl = baseBuilder.toString();
            String restOfUrl = restBuilder.toString();
            CIProperties syntheticHeaders = Exploder.syntheticHeaders(baseUrl + restOfUrl, archiveEntry.getSize());
            FuncZipExploder2.log.debug(entryName + " mapped to " + baseUrl + " plus " + restOfUrl);
            FuncZipExploder2.log.debug3(baseUrl + restOfUrl + " props " + syntheticHeaders);
            archiveEntry.setBaseUrl(baseUrl);
            archiveEntry.setRestOfUrl(restOfUrl);
            archiveEntry.setHeaderFields(syntheticHeaders);
            if (restOfUrl.endsWith(".pdf")) {
                addTocLinks(archiveEntry, pathElements, baseUrl, restOfUrl);
            }

            CIProperties auProps = new CIProperties();
            auProps.put(ConfigParamDescr.BASE_URL.getKey(), baseUrl);
            auProps.put(ConfigParamDescr.PUBLISHER_NAME.getKey(), PUB_NAME);
            auProps.put(ConfigParamDescr.JOURNAL_ISSN.getKey(), pathElements[JOU_INDEX].substring(TAG_LENGTH));
            auProps.put(ConfigParamDescr.YEAR.getKey(), pathElements[ART_INDEX].substring(TAG_LENGTH, TAG_LENGTH + YEAR_LENGTH));
            archiveEntry.setAuProps(auProps);
        }

        /**
         * Checks that {@code pathElements[index]} starts with {@code tags[index]},
         * logging a warning when it does not.
         */
        private static boolean hasExpectedTag(String[] pathElements, int index, String entryName) {
            if (pathElements[index].startsWith(tags[index])) {
                return true;
            }
            FuncZipExploder2.log.warning("Element " + index + " of " + entryName + " should be " + tags[index]);
            return false;
        }

        /** Adds the volume-level and journal-level table-of-contents links for a PDF entry. */
        private static void addTocLinks(ArchiveEntry archiveEntry, String[] pathElements, String baseUrl, String restOfUrl) {
            String volTocUrl = baseUrl + pathElements[VOL_INDEX] + "/index.html";
            String articleLink = "<li><a href=\"" + baseUrl + restOfUrl + "\">art #" + pathElements[ART_INDEX].substring(TAG_LENGTH) + "</a></li>\n";
            FuncZipExploder2.log.debug3("volTOC = " + volTocUrl + " link " + articleLink);
            archiveEntry.addTextTo(volTocUrl, articleLink);

            String journalTocUrl = baseUrl + "index.html";
            String volumeLink = "<li><a href=\"" + volTocUrl + "\">vol #" + pathElements[VOL_INDEX].substring(TAG_LENGTH) + "</a></li>\n";
            FuncZipExploder2.log.debug3("journalTOC = " + journalTocUrl + " link " + volumeLink);
            archiveEntry.addTextTo(journalTocUrl, volumeLink);
        }

        /** No-op: this helper does not use a watchdog. */
        public void setWatchdog(LockssWatchdog lockssWatchdog) {
        }

        /** No-op: this helper does not use a watchdog. */
        public void pokeWDog() {
        }
    }

    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MySimulatedArchivalUnit.class */
    /**
     * Simulated AU that uses {@link MyCrawlRule} and only considers caching URLs
     * listed in {@code FuncZipExploder2.url2}.
     */
    public static class MySimulatedArchivalUnit extends SimulatedArchivalUnit {
        /** Every whitelisted URL that {@link #shouldBeCached(String)} has been asked about. */
        List sbc;

        /**
         * @param plugin the plugin that owns this AU
         */
        public MySimulatedArchivalUnit(Plugin plugin) {
            super(plugin);
            this.sbc = new ArrayList();
        }

        /** @return a fresh {@link MyCrawlRule} */
        protected CrawlRule makeRules() {
            return new MyCrawlRule();
        }

        /**
         * Records the URL and defers to the superclass when it is one of
         * {@code FuncZipExploder2.url2}; any other URL is rejected outright.
         *
         * @param str the candidate URL
         * @return the superclass decision for whitelisted URLs, otherwise {@code false}
         */
        public boolean shouldBeCached(String str) {
            FuncZipExploder2.log.debug3("shouldBeCached: " + str);
            for (String allowedUrl : FuncZipExploder2.url2) {
                if (!allowedUrl.equals(str)) {
                    continue;
                }
                this.sbc.add(str);
                return super.shouldBeCached(str);
            }
            return false;
        }
    }

    /* loaded from: input_file:org/lockss/crawler/FuncZipExploder2$MySimulatedPlugin.class */
    /** Simulated plugin whose AUs are {@link MySimulatedArchivalUnit} instances. */
    public static class MySimulatedPlugin extends SimulatedPlugin {
        /**
         * Creates a {@link MySimulatedArchivalUnit} owned by this plugin and applies
         * the given configuration to it.
         *
         * @param configuration the AU configuration to apply
         * @return the configured AU
         * @throws ArchivalUnit.ConfigurationException if the AU rejects the configuration
         */
        @Override // org.lockss.plugin.simulated.SimulatedPlugin
        public ArchivalUnit createAu0(Configuration configuration) throws ArchivalUnit.ConfigurationException {
            MySimulatedArchivalUnit au = new MySimulatedArchivalUnit(this);
            au.setConfiguration(configuration);
            return au;
        }
    }

    /**
     * Runs the test standalone.
     *
     * @param strArr optional; the first element, when it parses as an integer,
     *     replaces the crawl depth limit
     * @throws Exception if set-up, the test itself, or tear-down fails
     */
    public static void main(String[] strArr) throws Exception {
        FuncZipExploder2 test = new FuncZipExploder2();
        if (strArr.length > 0) {
            try {
                maxDepth = Integer.parseInt(strArr[0]);
            } catch (NumberFormatException e) {
                // Keep the current depth, but say so instead of silently ignoring the argument.
                log.warning("Ignoring non-numeric depth argument '" + strArr[0] + "'; using depth " + maxDepth);
            }
        }
        log.info("Setting up for depth " + maxDepth);
        test.setUp(maxDepth);
        try {
            log.info("Running up for depth " + maxDepth);
            test.testRunSelf();
        } finally {
            // Stop the daemon even when the test fails.
            test.tearDown();
        }
    }

    /**
     * Standard test set-up: runs the superclass set-up and then configures the
     * fixture with the default maximum crawl depth.
     *
     * @throws Exception if either set-up step fails
     */
    @Override // org.lockss.test.LockssTestCase
    public void setUp() throws Exception {
        super.setUp();
        this.setUp(DEFAULT_MAX_DEPTH);
    }

    /**
     * Configures the daemon, plugin manager, crawl manager and simulated AU for a
     * crawl limited to the given depth.
     *
     * @param i maximum crawl depth; also stored in {@code maxDepth}
     * @throws Exception if any part of the fixture cannot be created or started
     */
    public void setUp(int i) throws Exception {
        final String explodedPluginName = "org.lockss.crawler.FuncZipExploder2MockExplodedPlugin";
        String tempDirPath = getTempDir().getAbsolutePath() + File.separator;

        // Daemon-wide configuration for this test run.
        Properties props = new Properties();
        props.setProperty("org.lockss.crawler.maxCrawlDepth", TestBaseCrawler.EMPTY_PAGE + i);
        maxDepth = i;
        props.setProperty("org.lockss.platform.diskSpacePaths", tempDirPath);
        props.setProperty("org.lockss.plugin.simulated.SimulatedContentGenerator.doZipFile", "true");
        props.setProperty("org.lockss.plugin.simulated.SimulatedContentGenerator.actualZipFile", "true");
        props.setProperty("org.lockss.crawler.storeArchives", "true");
        props.setProperty("org.lockss.crawler.exploder.explodedPluginName", explodedPluginName);
        ConfigurationUtil.addFromProps(props);

        // Daemon and its managers; the call order below is kept exactly as it was.
        this.theDaemon = getMockLockssDaemon();
        this.theDaemon.getAlertManager();
        this.theDaemon.setUpAuConfig();
        this.pluginMgr = new NonVersionCheckingPluginManager();
        this.pluginMgr.initService(this.theDaemon);
        this.theDaemon.setPluginManager(this.pluginMgr);
        this.crawlMgr = new NoPauseCrawlManagerImpl();
        this.theDaemon.setCrawlManager(this.crawlMgr);
        this.crawlMgr.initService(this.theDaemon);
        this.theDaemon.getRepositoryManager().startService();
        this.theDaemon.suppressStartAuManagers(false);
        this.theDaemon.setDaemonInited(true);
        this.pluginMgr.startService();
        this.pluginMgr.startLoadablePlugins();
        String explodedPluginKey = PluginManager.pluginKeyFromName(explodedPluginName);
        this.pluginMgr.ensurePluginLoaded(explodedPluginKey);

        // The simulated AU that will be crawled, consuming URLs through the exploding consumer.
        Configuration auConfig = simAuConfig(tempDirPath);
        this.sau = PluginTestUtil.createAndStartSimAu(MySimulatedPlugin.class, auConfig);
        this.sau.setUrlConsumerFactory(new ExplodingUrlConsumerFactory());
    }

    /**
     * Stops the mock daemon, if one was created, and then runs the superclass
     * tear-down. The superclass tear-down runs even when stopping the daemon fails.
     *
     * @throws Exception if stopping the daemon or the superclass tear-down fails
     */
    @Override // org.lockss.test.LockssTestCase
    public void tearDown() throws Exception {
        try {
            // theDaemon is still null when setUp failed before creating it.
            if (this.theDaemon != null) {
                this.theDaemon.stopDaemon();
            }
        } finally {
            super.tearDown();
        }
    }

    /**
     * Builds the configuration for the simulated AU.
     *
     * @param str value stored under the {@code "root"} key
     * @return a new configuration holding the root, depth, branch, file count,
     *     file type and binary file size settings
     */
    Configuration simAuConfig(String str) {
        String binFileSize = TestBaseCrawler.EMPTY_PAGE + fileSize;
        Configuration auConfig = ConfigManager.newConfiguration();
        auConfig.put("root", str);
        auConfig.put("depth", "3");
        auConfig.put(SimulatedContentGenerator.BRANCH_PREFIX, "1");
        auConfig.put("numFiles", "2");
        auConfig.put("fileTypes", "16");
        auConfig.put("binFileSize", binFileSize);
        return auConfig;
    }

    /**
     * End-to-end test: generates the simulated content, crawls it, verifies the
     * simulated files and both URL lists, checks the total AU content size is
     * within 60 of 285227, and finally checks the extracted DOIs.
     *
     * @throws Exception if any step fails unexpectedly
     */
    public void testRunSelf() throws Exception {
        log.debug3("About to create content");
        createContent();
        String simRootPath = this.sau.getSimRoot();
        log.debug3("About to crawl content");
        crawlContent();
        CachedUrlSet auCus = this.sau.getAuCachedUrlSet();

        File simRootDir = new File(simRootPath);
        if (!simRootDir.isDirectory()) {
            log.error("Error: The root path of the simulated content [" + simRootDir + "] is not a directory");
        } else {
            File[] topLevelFiles = simRootDir.listFiles();
            log.debug("Checking simulated content.");
            checkThruFileTree(topLevelFiles, auCus);
            log.debug("Checking simulated content done.");
            checkUnExplodedUrls();
            checkExplodedUrls();
            log.debug("Check finished.");
        }

        long actualSize = AuUtil.getAuContentSize(this.sau, true);
        long sizeDelta = Math.abs(285227 - actualSize);
        assertTrue("size mismatch 285227 vs. " + actualSize, sizeDelta < 60);
        checkDOIs();
    }

    /**
     * For every AU whose plugin is a {@code MockExplodedPlugin}: installs the test's
     * article iterator and metadata extractor factories, iterates the articles,
     * checks each full-text CU is a PDF with a valid DOI, and asserts that the
     * collected DOIs equal {@code doiSet}. AUs with any other plugin are skipped.
     */
    private void checkDOIs() {
        List<?> allAus = this.theDaemon.getPluginManager().getAllAus();
        for (int i = 0; i < allAus.size(); i++) {
            ArchivalUnit archivalUnit = (ArchivalUnit) allAus.get(i);
            assertNotNull(archivalUnit);
            log.debug("AU " + i + " : " + archivalUnit);
            // Hold the plugin by its interface type; only exploded AUs use the mock
            // plugin, so the cast must wait until after the instanceof check.
            Plugin plugin = archivalUnit.getPlugin();
            assertNotNull(plugin);
            log.debug("Exploded Plugin: " + plugin);
            if (!(plugin instanceof MockExplodedPlugin)) {
                continue;
            }
            MockExplodedPlugin mockExplodedPlugin = (MockExplodedPlugin) plugin;
            mockExplodedPlugin.setDefaultArticleMimeType("application/pdf");
            mockExplodedPlugin.setArticleIteratorFactory(new MyArticleIteratorFactory());
            mockExplodedPlugin.setArticleMetadataExtractorFactory(new MyArticleMetadataExtractorFactory());
            ArticleMetadataExtractor articleMetadataExtractor = mockExplodedPlugin.getArticleMetadataExtractor(MetadataTarget.Any(), archivalUnit);
            assertNotNull(articleMetadataExtractor);
            ArticleMetadataListExtractor listExtractor = new ArticleMetadataListExtractor(articleMetadataExtractor);

            int articleCount = 0;
            Set<String> foundDois = new HashSet<String>();
            Iterator<?> articleIterator = archivalUnit.getArticleIterator();
            while (articleIterator.hasNext()) {
                ArticleFiles articleFiles = (ArticleFiles) articleIterator.next();
                CachedUrl fullTextCu = articleFiles.getFullTextCu();
                assertNotNull(fullTextCu);
                String contentType = fullTextCu.getContentType();
                assertNotNull(contentType);
                assertTrue(contentType.toLowerCase().startsWith("application/pdf"));
                log.debug("count " + articleCount + " url " + fullTextCu.getUrl() + " " + contentType);
                articleCount++;
                try {
                    List<ArticleMetadata> extracted = listExtractor.extract(MetadataTarget.Any(), articleFiles);
                    assertNotEmpty(extracted);
                    ArticleMetadata articleMetadata = extracted.get(0);
                    assertNotNull(articleMetadata);
                    String doi = articleMetadata.get(MetadataField.FIELD_DOI);
                    log.debug(fullTextCu.getUrl() + " doi " + doi);
                    assertTrue(MetadataUtil.isDoi(doi));
                    foundDois.add(doi);
                } catch (Exception e) {
                    fail(e.toString());
                }
            }
            log.debug("Article count is " + articleCount);
            assertEquals(doiSet.size(), articleCount);
            assertEquals(doiSet, foundDois);
        }
    }

    /**
     * Recursively walks the simulated content files and checks each against the
     * crawl result: files whose link depth is within {@code maxDepth} must have a
     * cached URL with content; deeper files must not have content.
     *
     * @param fileArr files and directories to check; {@code null} (as returned by
     *     {@code File.listFiles()} for an unreadable directory) is treated as empty
     * @param cachedUrlSet passed through to recursive calls; not otherwise used here
     */
    private void checkThruFileTree(File[] fileArr, CachedUrlSet cachedUrlSet) {
        if (fileArr == null) {
            return;
        }
        for (File file : fileArr) {
            log.debug3("Check: " + file.getAbsolutePath());
            if (file.isDirectory()) {
                checkThruFileTree(file.listFiles(), cachedUrlSet);
                continue;
            }
            String fileUrl = this.sau.mapContentFileNameToUrl(file.getAbsolutePath());
            int linkDepth = this.sau.getLinkDepth(fileUrl);
            log.debug2("File: " + fileUrl + " in Level " + linkDepth);
            CachedUrl cu = this.theDaemon.getPluginManager().findCachedUrl(fileUrl);
            if (linkDepth <= maxDepth) {
                assertNotNull("Can't find CU for " + fileUrl, cu);
                assertTrue(cu + " has no content", cu.hasContent());
            } else {
                // A file beyond the depth limit may have no cached URL at all;
                // that counts as "no content" and must not raise an NPE.
                assertFalse(cu + " has content when it shouldn't", cu != null && cu.hasContent());
            }
        }
    }

    /**
     * Verifies that every URL in {@code url} is found with content in an
     * {@link ExplodedArchivalUnit} other than the simulated AU, and that this AU's
     * configured year equals {@code GOOD_YEAR}.
     */
    private void checkExplodedUrls() {
        log.debug2("Checking Exploded URLs.");
        PluginManager mgr;
        for (String explodedUrl : url) {
            mgr = this.theDaemon.getPluginManager();
            CachedUrl cu = mgr.findCachedUrl(explodedUrl);
            assertTrue(explodedUrl + " not in any AU", cu != null);
            ArchivalUnit owningAu = cu.getArchivalUnit();
            log.debug2("Check: " + explodedUrl + " cu " + cu + " au " + owningAu.getAuId());
            assertTrue(cu + " has no content", cu.hasContent());
            assertTrue(cu + " isn't ExplodedArchivalUnit", owningAu instanceof ExplodedArchivalUnit);
            assertNotEquals(this.sau, owningAu);
            Configuration auConfig = owningAu.getConfiguration();
            log.debug3(cu + " config " + auConfig);
            String configuredYear = auConfig.get(ConfigParamDescr.YEAR.getKey());
            assertEquals(cu + " wrong year", GOOD_YEAR, configuredYear);
        }
        log.debug2("Checking Exploded URLs done.");
    }

    /**
     * Verifies that every URL in {@code url2} is found with content, and that it
     * lives in the simulated AU, which must be a {@link MySimulatedArchivalUnit}.
     */
    private void checkUnExplodedUrls() {
        log.debug2("Checking UnExploded URLs.");
        for (String plainUrl : url2) {
            CachedUrl cu = this.theDaemon.getPluginManager().findCachedUrl(plainUrl);
            assertTrue(plainUrl + " not in any AU", cu != null);
            ArchivalUnit owningAu = cu.getArchivalUnit();
            log.debug2("Check: " + plainUrl + " cu " + cu + " au " + owningAu.getAuId());
            assertTrue(cu + " has no content", cu.hasContent());
            // Test the owning AU, not the CachedUrl: a CachedUrl is never an AU, so
            // the previous negated check on the CU could not fail.
            assertTrue(cu + " isn't MySimulatedArchivalUnit", owningAu instanceof MySimulatedArchivalUnit);
            assertEquals(this.sau, owningAu);
        }
        log.debug2("Checking UnExploded URLs done.");
    }

    /** Logs the tree dimensions and asks the simulated AU to generate its content tree. */
    private void createContent() {
        String announcement = "Generating tree of size 3x1x2 with " + fileSize + "byte files...";
        log.debug(announcement);
        this.sau.generateContentTree();
    }

    /**
     * Prepares the simulated AU (start URLs, crawl rule, exploder pattern and
     * helper) and runs a {@link FollowLinkCrawler} over it using the test's crawl manager.
     */
    private void crawlContent() {
        log.debug("Crawling tree...");
        SimulatedArchivalUnit au = this.sau;
        au.setStartUrls(au.getStartUrls());
        au.setRule(new MyCrawlRule());
        au.setExploderPattern(".zip$");
        au.setExploderHelper(new MyExploderHelper());
        FollowLinkCrawler crawler = new FollowLinkCrawler(au, AuUtil.getAuState(au));
        crawler.setCrawlManager(this.crawlMgr);
        crawler.doCrawl();
    }

    // Populates the expected DOIs and the list of URLs expected in the simulated AU,
    // and applies the default file size and crawl depth.
    static {
        fileSize = DEFAULT_FILESIZE;
        maxDepth = DEFAULT_MAX_DEPTH;
        url2 = new String[]{
            "http://www.example.com/index.html",
            "http://www.example.com/SpringerSample.zip",
            "http://www.example.com/001file.bin",
            "http://www.example.com/002file.bin",
            "http://www.example.com/branch1/001file.bin",
            "http://www.example.com/branch1/002file.bin",
            "http://www.example.com/branch1/branch1/001file.bin",
            "http://www.example.com/branch1/branch1/002file.bin",
            "http://www.example.com/branch1/branch1/branch1/001file.bin",
            "http://www.example.com/branch1/branch1/branch1/002file.bin",
            "http://www.example.com/branch1/branch1/branch1/index.html",
            "http://www.example.com/branch1/branch1/index.html",
            "http://www.example.com/branch1/index.html"
        };
        doiSet.add("10.1007/s00109-005-0721-x");
        doiSet.add("10.1007/s00109-005-0724-7");
        doiSet.add("10.1007/s00109-005-0719-4");
    }
}
