package org.lockss.extractor;

import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Scanner;
import java.util.Set;
import org.lockss.crawler.TestBaseCrawler;
import org.lockss.extractor.LinkExtractor;
import org.lockss.plugin.ArchivalUnit;
import org.lockss.test.LockssTestCase;
import org.lockss.test.MockArchivalUnit;
import org.lockss.test.MockCachedUrl;
import org.lockss.test.StringInputStream;
import org.lockss.util.ListUtil;
import org.lockss.util.SetUtil;
import org.lockss.util.TypedEntryMap;
import org.lockss.util.UrlUtil;

/* loaded from: input_file:org/lockss/extractor/TestJsoupHtmlLinkExtractor.class */
public class TestJsoupHtmlLinkExtractor extends LockssTestCase {
    /** Sample start URL; not referenced by the code visible in this class. */
    public static final String startUrl = "http://www.example.com/index.html";
    /** Mock archival unit, recreated for every test in setUp(). */
    private MockArchivalUnit m_mau;
    /** Extractor under test, recreated for every test in setUp(). */
    private JsoupHtmlLinkExtractor m_extractor;
    /** Shared callback that collects the URLs reported by the extractor. */
    private MyLinkExtractorCallback m_callback;
    // HTTP, HEADER and CONTENT are not referenced by the code visible in this class.
    private static final String HTTP = "http";
    private static final String HEADER = "header";
    private static final String CONTENT = "content";
    /** Scanner delimiter regex used by getPageContent() to read a whole stream as one token. */
    private static final String END_OF_INPUT = "\\Z";
    /** Encoding name passed to every extractUrls() call in this class. */
    static String ENC = "ISO-8859-1";
    /** Platform line separator, appended after each header line in getPageHeader(). */
    private static final String NEWLINE = System.getProperty("line.separator");

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/lockss/extractor/TestJsoupHtmlLinkExtractor$MyLinkExtractorCallback.class */
    public static class MyLinkExtractorCallback implements LinkExtractor.Callback {
        /** URLs reported so far; swapped for a brand-new set by {@link #reset()}. */
        Set<String> foundUrls = new HashSet<String>();

        private MyLinkExtractorCallback() {
            // The set is created by the field initialiser above.
        }

        /** Records one link reported by the extractor. */
        public void foundLink(String str) {
            foundUrls.add(str);
        }

        /** Returns the live set of recorded links (not a copy). */
        public Set<String> getFoundUrls() {
            return foundUrls;
        }

        /**
         * Starts a new collection. A new set is allocated rather than clearing the old
         * one, so sets handed out earlier by {@link #getFoundUrls()} keep their contents.
         */
        public void reset() {
            foundUrls = new HashSet<String>();
        }
    }

    /** Creates a fresh mock AU, extractor and callback before each test. */
    @Override // org.lockss.test.LockssTestCase
    public void setUp() throws Exception {
        super.setUp();
        m_mau = new MockArchivalUnit();
        // The Map casts give the two null arguments a static type for the constructor call.
        m_extractor = new JsoupHtmlLinkExtractor(false, true, (Map) null, (Map) null);
        m_callback = new MyLinkExtractorCallback();
    }

    /**
     * Empty in this file: the method performs no calls and no assertions.
     * NOTE(review): confirm whether a body was intended here.
     */
    public void testExtractUrls() throws Exception {
    }

    /**
     * Empty in this file: the method performs no calls and no assertions.
     * NOTE(review): confirm whether a body was intended here.
     */
    public void testRegisterTagExtractor() throws Exception {
    }

    /**
     * Fetches a live page, extracts its links and tries to open every link containing
     * "article", failing if any of them cannot be opened.
     *
     * <p>The name does not start with {@code test}; presumably this keeps the test
     * disabled because it needs network access (confirm against the test runner).
     *
     * @throws Exception if the page cannot be fetched or extraction fails
     */
    public void xxxtestCharsetChange() throws Exception {
        final String pageUrl = "http://www.pensoft.net/journals/neobiota/issue/11/";
        MyLinkExtractorCallback callback = new MyLinkExtractorCallback();
        InputStream pageStream = new URL(pageUrl).openStream();
        try {
            this.m_extractor.extractUrls(this.m_mau, pageStream, ENC, pageUrl, callback);
        } finally {
            // Always release the network stream, even when extraction throws.
            pageStream.close();
        }
        for (String found : callback.getFoundUrls()) {
            if (!found.contains("article")) {
                continue;
            }
            try {
                // NOTE(review): the value returned by openInputStream is not closed here
                // because its type is not visible in this file.
                UrlUtil.openInputStream(found);
            } catch (IOException e) {
                fail("crawl of url: " + found + ":" + e.getMessage());
            }
        }
    }

    /** extractUrls must reject a null input stream with IllegalArgumentException. */
    public void testThrowsOnNullInputStream() throws Exception {
        final InputStream noStream = null;
        try {
            m_extractor.extractUrls(m_mau, noStream, ENC, "http://www.example.com/", new MyLinkExtractorCallback());
            fail("Calling extractUrls with a null InputStream should have thrown");
        } catch (IllegalArgumentException expected) {
            // This is the outcome the test is looking for.
        }
    }

    /**
     * extractUrls must reject a null source URL with IllegalArgumentException.
     * The input stream is closed on every path.
     */
    public void testThrowsOnNullSourceUrl() throws Exception {
        StringInputStream input = new StringInputStream("Blah");
        try {
            this.m_extractor.extractUrls(this.m_mau, input, ENC, (String) null, new MyLinkExtractorCallback());
            fail("Calling extractUrls with a null source URL should have thrown");
        } catch (IllegalArgumentException expected) {
            // This is the outcome the test is looking for.
        } finally {
            input.close();
        }
    }

    /**
     * extractUrls must reject a null callback with IllegalArgumentException.
     * The input stream is closed on every path.
     */
    public void testThrowsOnNullCallback() throws Exception {
        StringInputStream input = new StringInputStream("blah");
        try {
            this.m_extractor.extractUrls(this.m_mau, input, ENC, "http://www.example.com/", (LinkExtractor.Callback) null);
            fail("Calling extractUrls with a null callback should have thrown");
        } catch (IllegalArgumentException expected) {
            // This is the outcome the test is looking for.
        } finally {
            input.close();
        }
    }

    /** An anchor with a space before href must yield its link. */
    public void testParsesHref() throws Exception {
        final String link = "http://www.example.com/web_link.html";
        singleTagShouldParse(link, "<a href=", "</a>");
    }

    /** A tab between the tag name and href must still yield the link. */
    public void testParsesHrefWithTab() throws Exception {
        final String link = "http://www.example.com/web_link.html";
        singleTagShouldParse(link, "<a\thref=", "</a>");
    }

    /** A carriage return between the tag name and href must still yield the link. */
    public void testParsesHrefWithCarriageReturn() throws Exception {
        final String link = "http://www.example.com/web_link.html";
        singleTagShouldParse(link, "<a\rhref=", "</a>");
    }

    /** A newline between the tag name and href must still yield the link. */
    public void testParsesHrefWithNewLine() throws Exception {
        final String link = "http://www.example.com/web_link.html";
        singleTagShouldParse(link, "<a\nhref=", "</a>");
    }

    /** The src of an img tag is extracted, with or without other attributes before it. */
    public void testParsesImage() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        String[] openTags = {"<img src=", "<img\nwidth='280' hight='90' src="};
        for (String openTag : openTags) {
            singleTagShouldParse(link, openTag, "</img>");
        }
    }

    /** An img whose src is a data: URI must not produce any link, unquoted or quoted. */
    public void testParsesImageData() throws Exception {
        final String dataUri = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==";
        singleTagShouldNotParse(dataUri, "<img src=", "</img>");

        final String quotedTag = "<img src=\"" + dataUri + "\" alt=\"Red dot\" />";
        Set<String> found = parseSingleSource(quotedTag);
        assertEquals(SetUtil.set(new Object[0]), found);
    }

    /**
     * An anchor whose href is a text/html data: URI wrapping another anchor is expected
     * to yield the embedded link. A second extractor is registered on a local mock AU
     * for text/html, while the extraction itself is driven through m_extractor.
     */
    public void testParsesDataUri() throws Exception {
        final String baseUrl = "http://www.example.com";
        final String html = "<html><head><title>Test</title></head><body><a href=\"data:text/html;charset=utf-8,%3Ca+href%3D%22http%3A%2F%2Fwww.example.com%2Flink3.html%22%3Elink3%3C%2Fa%3E\">link3</a>";
        MockArchivalUnit au = new MockArchivalUnit();
        JsoupHtmlLinkExtractor htmlExtractor = new JsoupHtmlLinkExtractor(false, false, (Map) null, (Map) null);
        au.setLinkExtractor("text/html", htmlExtractor);
        MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl, au);
        cachedUrl.setContent(html);
        m_callback.reset();
        m_extractor.extractUrls(au, new StringInputStream(html), ENC, baseUrl, m_callback);
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        assertEquals(SetUtil.set(expectedUrls), m_callback.getFoundUrls());
    }

    /** The src of an embed tag is extracted. */
    public void testParsesEmbed() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<embed src=", "</embed>");
    }

    /** The code attribute of an applet tag is extracted. */
    public void testParsesApplet() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<applet code=", "</applet>");
    }

    /** The href of an area tag is extracted, with or without other attributes before it. */
    public void testParsesArea() throws Exception {
        final String link = "http://www.example.com/web_link.shtml";
        String[] openTags = {"<area href=", "<area shape='rect' coords='279,481,487' href="};
        for (String openTag : openTags) {
            singleTagShouldParse(link, openTag, "</area>");
        }
    }

    /** The codebase attribute of an object tag is extracted. */
    public void testParsesObject() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<object codebase=", "</object>");
    }

    /**
     * Sets the "html-parser-select-attrs" AU property to ["value"] and asserts that
     * option value attributes still produce no links.
     * NOTE(review): the name says "Positive" but the assertions are negative; confirm
     * this is the intended expectation for this extractor.
     */
    public void testParsesOptionPositive() throws Exception {
        TypedEntryMap props = new TypedEntryMap();
        props.setMapElement("html-parser-select-attrs", ListUtil.list(new String[]{"value"}));
        m_mau.setPropertyMap(props);

        final String link = "http://www.example.com/web_link.jpg";
        String[] openTags = {"<option  value=", "<option a=b value="};
        for (String openTag : openTags) {
            singleTagShouldNotParse(link, openTag, "</option>", m_mau);
        }
    }

    /** Without any AU property set, option value attributes produce no links. */
    public void testParsesOptionNegative() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        String[] openTags = {"<option  value=", "<option a=b value="};
        for (String openTag : openTags) {
            singleTagShouldNotParse(link, openTag, "</option>");
        }
    }

    /** An alt attribute whose value is the word "src" must not confuse src extraction. */
    public void testDoCrawlImageWithSrcInAltTag() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        String[] openTags = {"<img alt=src src=", "<img alt = src src="};
        for (String openTag : openTags) {
            singleTagShouldParse(link, openTag, "</img>");
        }
    }

    /** Same as the alt=src case, but with the alt attribute placed after the real src. */
    public void testDoCrawlImageWithSrcInAltTagAfterSrcProper() throws Exception {
        final String pageUrl = "http://www.example.com/index.html";
        final String html = "<html><head><title>Test</title></head><body><img src=http://www.example.com/link3.html alt=src>link3</a>";
        MockCachedUrl cachedUrl = new MockCachedUrl(pageUrl);
        cachedUrl.setContent(html);
        m_extractor.extractUrls(m_mau, new StringInputStream(html), ENC, pageUrl, m_callback);

        HashSet<String> expected = new HashSet<String>();
        expected.add("http://www.example.com/link3.html");
        assertEquals(expected, m_callback.getFoundUrls());
    }

    /** Both frame src URLs inside a frameset are extracted. */
    public void testDoCrawlFrame() throws Exception {
        final String html = "<html><head></head><frameset>"
            + "<frame src=\"http://www.example.com/menu.html\"></frame>"
            + "<frame src=\"http://www.example.com/content.html\">"
            + "</frameset></html>";
        String[] expectedUrls = {"http://www.example.com/menu.html", "http://www.example.com/content.html"};
        Set<String> found = parseSingleSource(html);
        assertIsomorphic(SetUtil.set(expectedUrls), found);
    }

    /** An anchor whose href is "JavaScript:void(0);" yields no links. */
    public void testAnchorTagWithClass() throws Exception {
        final String html = "<html><head><title>Test Title</title></head><body><div class=\"holder\">"
            + "<a title=\"Open Figure Viewer\" onclick=\"showFigures(this,event); return false;\" href=\"JavaScript:void(0);\" class=\"thumbnail\"></a>"
            + "</div></body></html>";
        Set<String> found = parseSingleSource(html);
        assertEmpty(found);
    }

    /** The href of a link tag is extracted, with or without other attributes before it. */
    public void testDoCrawlLink() throws Exception {
        final String link = "http://www.example.com/web_link.css";
        String[] openTags = {"<link href=", "<link rel=\"stylesheet\" type=\"text/css\" media=\"screen\"  href="};
        for (String openTag : openTags) {
            singleTagShouldParse(link, openTag, "</link>");
        }
    }

    /** Absolute URLs inside a bare style element. */
    public void testDoCrawlStyleAbsolute() throws Exception {
        final String base = "http://www.example.com/";
        performDoCrawlStyle("<style>", base, base);
    }

    /** URLs inside a bare style element, prefixed with TestBaseCrawler.EMPTY_PAGE in the page. */
    public void testDoCrawlStyleRelative() throws Exception {
        final String urlPrefixInPage = TestBaseCrawler.EMPTY_PAGE;
        final String expectedPrefix = "http://www.example.com/";
        performDoCrawlStyle("<style>", urlPrefixInPage, expectedPrefix);
    }

    /** Absolute URLs inside a style element that carries a type attribute. */
    public void testDoCrawlStyleWithTypeAttributeAbsolute() throws Exception {
        final String base = "http://www.example.com/";
        performDoCrawlStyle("<style type=\"text/css\">", base, base);
    }

    /** Like the relative style test, with a type attribute on the style element. */
    public void testDoCrawlStyleWithTypeAttributeRelative() throws Exception {
        final String urlPrefixInPage = TestBaseCrawler.EMPTY_PAGE;
        final String expectedPrefix = "http://www.example.com/";
        performDoCrawlStyle("<style type=\"text/css\">", urlPrefixInPage, expectedPrefix);
    }

    /** A single absolute import inside a bare style element. */
    public void testDoCrawlStyleAbsoluteShort() throws Exception {
        final String base = "http://www.example.com/";
        performDoCrawlStyleShort("<style>", base, base);
    }

    /**
     * Builds a page whose style element holds four imports and two url() references,
     * then asserts that all six resources are found.
     *
     * @param str  opening style tag to use
     * @param str2 prefix written in front of each resource name inside the page
     * @param str3 prefix expected in front of each resource name in the result
     */
    protected void performDoCrawlStyle(String str, String str2, String str3) throws Exception {
        String[] names = {"foo1.css", "foo2.css", "foo3.css", "foo4.css", "img5.gif", "img6.gif"};
        String[] expectedUrls = new String[names.length];
        for (int i = 0; i < names.length; i++) {
            expectedUrls[i] = str3 + names[i];
        }

        StringBuilder html = new StringBuilder();
        html.append("<html>\n <head>\n  <title>Test</title>\n  ").append(str).append("\n<!--\n");
        html.append("@import url('").append(str2).append("foo1.css');\n");
        html.append("@import url(\"").append(str2).append("foo2.css\");\n");
        html.append("@import '").append(str2).append("foo3.css';\n");
        html.append("@import \"").append(str2).append("foo4.css\";\n");
        html.append("foo {\n");
        html.append(" bar: url('").append(str2).append("img5.gif');\n");
        html.append(" baz: url(\"").append(str2).append("img6.gif\");\n");
        html.append("}\n/* Comment */-->\n  </style>\n </head>\n <body>\n  <p>Fake content</p>\n </body>\n</html>\n");

        assertEquals(SetUtil.set(expectedUrls), parseSingleSource(html.toString()));
    }

    /**
     * Builds a page whose style element holds a single import and asserts that it is found.
     *
     * @param str  opening style tag to use
     * @param str2 prefix written in front of the resource name inside the page
     * @param str3 prefix expected in front of the resource name in the result
     */
    protected void performDoCrawlStyleShort(String str, String str2, String str3) throws Exception {
        String[] expectedUrls = {str3 + "foo3333.css"};

        StringBuilder html = new StringBuilder();
        html.append("<html>\n <head>\n  <title>Test</title>\n  ").append(str).append("\n<!--\n");
        html.append("@import '").append(str2).append("foo3333.css';\n");
        html.append("}\n/* Comment */-->\n  </style>\n </head>\n <body>\n  <p>Fake content</p>\n </body>\n</html>\n");

        assertEquals(SetUtil.set(expectedUrls), parseSingleSource(html.toString()));
    }

    /** The background attribute of a body tag is extracted. */
    public void testDoCrawlBody() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<body background=", "</body>");
    }

    /** The background attribute of a table tag is extracted. */
    public void testDoCrawlTable() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<table background=", "</table>");
    }

    /** The background attribute of a td inside a table is extracted. */
    public void testDoCrawlTd() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<table> <td background=", "</td></table>");
    }

    /** The background attribute of a th inside a table is extracted. */
    public void testDoCrawlTh() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<table><th background=", "</th></table>");
    }

    /** The src attribute of a script tag is extracted. */
    public void testDoCrawlScript() throws Exception {
        final String link = "http://www.example.com/web_link.jpg";
        singleTagShouldParse(link, "<script src=", "</script>");
    }

    /** Equals signs and ampersands in an unquoted href query string are kept intact. */
    public void testDoCrawlWithEqualsInUrl() throws Exception {
        final String link = "http://www.example.com/acs/a/toc.select?in_coden=jcisd8&in_volume=43";
        singleTagShouldParse(link, "<a href=", "</a>");
    }

    /** Newline before href, followed by an unterminated closing tag ("&lt;/a" with no "&gt;"). */
    public void testDoCrawlWithLineBreakBeforeTag() throws Exception {
        final String link = "http://www.example.com/web_link.html";
        singleTagShouldParse(link, "<a\nhref=", "</a");
    }

    /** Asserts that the tag built from the given parts yields exactly the link {@code str}; no AU is supplied. */
    private void singleTagShouldParse(String str, String str2, String str3) throws Exception {
        singleTagParse(str, str2, str3, (ArchivalUnit) null, true);
    }

    /** Asserts that the tag built from the given parts yields exactly the link {@code str}. */
    private void singleTagShouldParse(String str, String str2, String str3, ArchivalUnit archivalUnit) throws Exception {
        final boolean expectLink = true;
        singleTagParse(str, str2, str3, archivalUnit, expectLink);
    }

    /** Asserts that the tag built from the given parts yields no link at all; no AU is supplied. */
    private void singleTagShouldNotParse(String str, String str2, String str3) throws Exception {
        singleTagParse(str, str2, str3, (ArchivalUnit) null, false);
    }

    /** Asserts that the tag built from the given parts yields no link at all. */
    private void singleTagShouldNotParse(String str, String str2, String str3, ArchivalUnit archivalUnit) throws Exception {
        final boolean expectLink = false;
        singleTagParse(str, str2, str3, archivalUnit, expectLink);
    }

    /**
     * Wraps one tag in a minimal page, runs the extractor on it and checks the result.
     *
     * @param str          URL placed as the attribute value; the only expected link when {@code z} is true
     * @param str2         opening tag text up to and including the "=" before the URL
     * @param str3         closing tag text
     * @param archivalUnit AU to extract against; when null the shared mock AU is used
     * @param z            true if {@code str} must be the single link found, false if no link may be found
     */
    private void singleTagParse(String str, String str2, String str3, ArchivalUnit archivalUnit, boolean z) throws Exception {
        final String baseUrl = "http://www.example.com";
        MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
        String html = makeContent(str, str2, str3);
        cachedUrl.setContent(html);

        // Honour the caller-supplied AU (previously ignored); fall back to the shared
        // mock so callers passing null keep their existing behaviour.
        ArchivalUnit au = (archivalUnit != null) ? archivalUnit : this.m_mau;

        MyLinkExtractorCallback callback = new MyLinkExtractorCallback();
        this.m_extractor.extractUrls(au, new StringInputStream(html), ENC, baseUrl, callback);

        HashSet<String> expected = new HashSet<String>();
        if (z) {
            expected.add(str);
        }
        assertEquals("Misparsed: " + html, expected, callback.getFoundUrls());
    }

    /** Misspelled anchor tag or href attribute names must not yield links. */
    public void testDoNotCrawlBadA() throws Exception {
        String[] badOpenTags = {"<a harf=", "<a hre=", "<a hrefe=", "<al href="};
        checkBadTags(badOpenTags, "</a>");
    }

    /** Misspelled frame tag or src attribute names must not yield links. */
    public void testDoNotCrawlBadFrameTag() throws Exception {
        String[] badOpenTags = {"<fram src=", "<framea src=", "<framr src=", "<frame sr=", "<frame srcr=", "<frame sra="};
        checkBadTags(badOpenTags, "</frame>");
    }

    /**
     * Misspelled img tag or src attribute names must not yield links.
     * The closing tag is {@code </img>} so the fixture matches the tag under test.
     */
    public void testDoNotCrawlBadImgTag() throws Exception {
        String[] badOpenTags = {"<im src=", "<imga src=", "<ime src=", "<img sr=", "<img srcr=", "<img sra="};
        checkBadTags(badOpenTags, "</img>");
    }

    /** Misspelled link tag or href attribute names must not yield links. */
    public void testDoNotCrawlBadLinkTag() throws Exception {
        String[] badOpenTags = {"<lin href=", "<linkf href=", "<lino href=", "<link hre=", "<link hrefr=", "<link hrep="};
        checkBadTags(badOpenTags, "</link>");
    }

    /** Misspelled body tag or background attribute names must not yield links. */
    public void testDoNotCrawlBadBodyTag() throws Exception {
        String[] badOpenTags = {"<bod background=", "<bodyk background=", "<bodp background=", "<body backgroun=", "<body backgrounyl=", "<body backgrounj="};
        checkBadTags(badOpenTags, "</body>");
    }

    /** Misspelled script tag or src attribute names must not yield links. */
    public void testDoNotCrawlBadScriptTag() throws Exception {
        String[] badOpenTags = {"<scrip src=", "<scriptl src=", "<scripo src=", "<script sr=", "<script srcu=", "<script srp="};
        checkBadTags(badOpenTags, "</script>");
    }

    /** Misspelled table tag or background attribute names must not yield links. */
    public void testDoNotCrawlBadTableTag() throws Exception {
        String[] badOpenTags = {"<tabl background=", "<tablea background=", "<tablu background=", "<table backgroun=", "<table backgroundl=", "<table backgrouno="};
        checkBadTags(badOpenTags, "</table>");
    }

    /** Misspelled td tag or background attribute names must not yield links. */
    public void testDoNotCrawlBadTdTag() throws Exception {
        String[] badOpenTags = {"<t background=", "<tdl background=", "<ta background=", "<td backgroun=", "<td backgroundl=", "<td backgrouno="};
        checkBadTags(badOpenTags, "</td>");
    }

    /** Misspelled th tag or background attribute names must not yield links. */
    public void testDoNotCrawlBadThTag() throws Exception {
        String[] badOpenTags = {"<t background=", "<thl background=", "<ta background=", "<th backgroun=", "<th backgroundl=", "<th backgrouno="};
        checkBadTags(badOpenTags, "</th>");
    }

    /** An href with no value yields no links. */
    public void testEmptyAttribute() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=>link3</a>";
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(new Object[0]), found);
    }

    /** url() references in a style attribute are extracted, alone and alongside an href. */
    public void testStyleAttribute() throws Exception {
        final String spanHtml = "<html><head><title>Test</title></head><body><span class=\"foo\" style=\"background: url('/backg.png') no-repeat 0px -64px;\" />";
        String[] spanExpected = {"http://www.example.com/backg.png"};
        Set<String> spanFound = parseSingleSource(spanHtml);
        assertEquals(SetUtil.set(spanExpected), spanFound);

        final String anchorHtml = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link3.html\" style=\"background: url('/backg.png');\">link3</a>";
        String[] anchorExpected = {"http://www.example.com/link3.html", "http://www.example.com/backg.png"};
        Set<String> anchorFound = parseSingleSource(anchorHtml);
        assertEquals(SetUtil.set(anchorExpected), anchorFound);
    }

    /** An href using the scheme "badprotocol" yields no links. */
    public void testParseUnknownProtocol() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"badprotocol://www.example.com/link3.html\">link3</a>";
        Set<String> found = parseSingleSource(html);
        assertEmpty(found);
    }

    /** A double-quoted href is extracted without its quotes. */
    public void testParsesFileWithQuotedUrls() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link3.html\">link3</a>";
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** javascript: values in href and src attributes yield no links. */
    public void testDontParseJSByDefault() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href = javascript:newWindow('http://www.example.com/link3.html')</a>"
            + "<a href = javascript:popup('http://www.example.com/link2.html')</a>"
            + "<img src = javascript:popup('http://www.example.com/link1.html') </img>"
            + "</body></html>";
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(new Object[0]), found);
    }

    /** A mailto: href yields no links. */
    public void testDontParseMailto() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href = mailto:user@example.com</a>";
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(new Object[0]), found);
    }

    /** A site-relative PDF href inside a table fragment resolves against the base URL. */
    public void testParseHWPDF() throws Exception {
        final String html = "<table cellspacing=\"0\" cellpadding=\"10\" width=\"250\" border=\"0\"><tr><td align=center bgcolor=\"#DBDBDB\">\n\n"
            + "\t<font face=\"verdana,arial,helvetica,sans-serif\"><strong><font size=+1>Automatic download</font><br>\n"
            + "\t<font size=\"-1\">[<a target=\"_self\" href=\"/cgi/reprint/21/1/2.pdf\" onclick=\"cancelLoadPDF()\">Begin manual download</a>]</strong></font>\n";
        String[] expectedUrls = {"http://www.example.com/cgi/reprint/21/1/2.pdf"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Numeric character references in hrefs are decoded before the URL is resolved. */
    public void testResolvesHtmlEntities() throws Exception {
        final String ampHtml = "<html><head><title>Test</title></head><body><a href=http://www.example.com/bioone/?request=get-toc&#38;issn=0044-7447&#38;volume=32&issue=1>link1</a>";
        String[] ampExpected = {"http://www.example.com/bioone/?request=get-toc&issn=0044-7447&volume=32&issue=1"};
        Set<String> ampFound = parseSingleSource(ampHtml);
        assertEquals(SetUtil.set(ampExpected), ampFound);

        final String dotsHtml = "<html><head><title>Test</title></head><body><base href=http://www.example.com/foo/bar><a href=&#46&#46/xxx>link1</a>";
        String[] dotsExpected = {"http://www.example.com/xxx"};
        Set<String> dotsFound = parseSingleSource(dotsHtml);
        assertEquals(SetUtil.set(dotsExpected), dotsFound);
    }

    /** With several base tags present, all three relative links are expected under www.example.com. */
    public void testInterpretsBaseTag() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<base href=http://www.example.com><a href=link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<base href=http://www.example2.com><a href=link2.html>link2</a>"
            + "<base href=http://www.example3.com><a href=link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html", "http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Base tags with a javascript: or scheme-less href do not change how the links resolve. */
    public void testInterpretsMalformedBaseTag() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<base href=http://www.example.com><a href=link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<base href=javascript:www.example2.com><a href=link2.html>link2</a>"
            + "<base href=www.example.com><a href=http://www.example2.com/link3.html>link3</a>"
            + "<base href=http://www.example3.com><a href=link3.html>link4</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html", "http://www.example2.com/link3.html", "http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertIsomorphic(SetUtil.set(expectedUrls), found);
    }

    /** A base tag without an href attribute leaves link resolution unchanged. */
    public void testIgnoresNullHrefInBaseTag() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<base blah=blah><a href=link2.html>link2</a>"
            + "<a href=link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html", "http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** A base tag with an empty href leaves link resolution unchanged. */
    public void testIgnoresEmptyHrefInBaseTag() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<base href=\"\" blah=blah><a href=link2.html>link2</a>"
            + "<a href=link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html", "http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Links inside an HTML comment are skipped; the link after it is found. */
    public void testSkipsComments() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<!--<a href=http://www.example.com/link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=http://www.example.com/link2.html>link2</a>-->"
            + "<a href=http://www.example.com/link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Same as the comment test, but the comment is closed with the malformed "--!&gt;". */
    public void testSkipsMalformedComments() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<!--<a href=http://www.example.com/link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=http://www.example.com/link2.html>link2</a>--!>"
            + "<a href=http://www.example.com/link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Anchors written inside a script element are skipped; the anchor after it is found. */
    public void testSkipsScriptTags() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<script><a href=http://www.example.com/link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=http://www.example.com/link2.html>link2</a></script>"
            + "<a href=http://www.example.com/link3.html>link3</a></body></html>";
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Like the script test, but the last anchor inside the script is cut off by the closing script tag. */
    public void testSkipsScriptTagsAllTheWay() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<script><a href=http://www.example.com/link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=http://www.example.com/link2.html</script>"
            + "<a href=http://www.example.com/link3.html>link3</a></body></html>";
        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Runs the script-skipping check with no assertion message. */
    private void doScriptSkipTest(String str, String str2) throws Exception {
        final String noMessage = null;
        doScriptSkipTest(str, str2, noMessage);
    }

    /**
     * Wraps two anchors between the given tags, appends a third anchor, and asserts
     * that only the third one is found.
     *
     * @param str  opening tag placed before the two wrapped anchors
     * @param str2 closing tag placed after them
     * @param str3 assertion message, may be null
     */
    private void doScriptSkipTest(String str, String str2, String str3) throws Exception {
        StringBuilder html = new StringBuilder();
        html.append("<html><head><title>Test</title></head><body>");
        html.append(str);
        html.append("<a href=http://www.example.com/link1.html>link1</a>Filler, with <b>bold</b> tags and<i>others</i><a href=http://www.example.com/link2.html>link2</a>");
        html.append(str2);
        html.append("<a href=http://www.example.com/link3.html>link3</a>");

        String[] expectedUrls = {"http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html.toString());
        assertEquals(str3, SetUtil.set(expectedUrls), found);
    }

    /** Script tags written in mixed case are still skipped. */
    public void testSkipsScriptTagsIgnoreCase() throws Exception {
        final String openTag = "<ScRipt>";
        final String closeTag = "</sCripT>";
        doScriptSkipTest(openTag, closeTag);
    }

    /** Spaces inside a quoted href are expected to come back as %20. */
    public void testKeepsSpaceInUrl() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link with space.html\">Link</a>";
        String[] expectedUrls = {"http://www.example.com/link%20with%20space.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Newlines inside a quoted href are dropped from the extracted URL. */
    public void testIgnoresNewLineInUrl() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link\nwith\nspace.html\">Link</a>";
        String[] expectedUrls = {"http://www.example.com/linkwithspace.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** A newline between the img tag name and its src attribute does not prevent extraction. */
    public void testIgnoresNewLineInField() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><img\nsrc=\"http://www.example.com/link.html\">Link</a>";
        String[] expectedUrls = {"http://www.example.com/link.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Carriage returns inside a quoted href are dropped from the extracted URL. */
    public void testIgnoresCRInUrl() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link\rwith\rspace.html\">Link</a>";
        String[] expectedUrls = {"http://www.example.com/linkwithspace.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Double quotes inside a single-quoted href are expected to come back as %22. */
    public void testKeepsDoubleQuoteInUrl() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href='http://www.example.com/link\"with\"quotes.html'>Link</a>";
        String[] expectedUrls = {"http://www.example.com/link%22with%22quotes.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Single quotes inside a double-quoted href are kept verbatim. */
    public void testKeepsSingleQuoteInUrl() throws Exception {
        final String html = "<html><head><title>Test</title></head><body><a href=\"http://www.example.com/link'with'quotes.html\">Link</a>";
        String[] expectedUrls = {"http://www.example.com/link'with'quotes.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Three absolute, unquoted hrefs in one page are all found. */
    public void testMultipleLinks() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=http://www.example.com/link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a href=http://www.example.com/link2.html>link2</a>"
            + "<a href=http://www.example.com/link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html", "http://www.example.com/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** Relative hrefs resolve against the base URL; a fragment and unrelated attributes are tolerated. */
    public void testRelativeLinksLocationTagsAndMultipleKeys() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=link1.html>link1</a>"
            + "Filler, with <b>bold</b> tags and<i>others</i>"
            + "<a blah1=blah href=link2.html#ref blah2=blah>link2</a>"
            + "<a href=dir/link3.html>link3</a>";
        String[] expectedUrls = {"http://www.example.com/link1.html", "http://www.example.com/link2.html#ref", "http://www.example.com/dir/link3.html"};
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(expectedUrls), found);
    }

    /** The URL in a meta refresh is extracted, with or without a space after the semicolon. */
    public void testHttpEquiv() throws Exception {
        String[] expectedUrls = {"http://example.com/blah.html"};

        final String spacedHtml = "<html><head><meta http-equiv=\"refresh\" content=\"0; url=http://example.com/blah.html\"></head></html>";
        Set<String> spacedFound = parseSingleSource(spacedHtml);
        assertEquals(SetUtil.set(expectedUrls), spacedFound);

        final String compactHtml = "<html><head><meta http-equiv=\"refresh\" content=\"0;url=http://example.com/blah.html\"></head></html>";
        Set<String> compactFound = parseSingleSource(compactHtml);
        assertEquals(SetUtil.set(expectedUrls), compactFound);
    }

    /** A meta tag whose http-equiv is "blah" rather than "refresh" yields no links. */
    public void testHttpEquiv2() throws Exception {
        final String html = "<html><head><meta http-equiv=\"blah\" content=\"0; url=http://example.com/blah.html\"></head></html>";
        Set<String> found = parseSingleSource(html);
        assertEquals(SetUtil.set(new Object[0]), found);
    }

    /**
     * Runs the extractor over {@code str} with base URL "http://www.example.com", using
     * a new mock AU that has a RegexpCssLinkExtractor registered for text/css.
     *
     * @param str HTML source to parse
     * @return the shared callback's set of found URLs; the callback is reset before the run
     */
    private Set<String> parseSingleSource(String str) throws Exception {
        final String baseUrl = "http://www.example.com";
        MockArchivalUnit au = new MockArchivalUnit();
        au.setLinkExtractor("text/css", new RegexpCssLinkExtractor());
        MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl, au);
        cachedUrl.setContent(str);
        m_callback.reset();
        m_extractor.extractUrls(au, new StringInputStream(str), ENC, baseUrl, m_callback);
        return m_callback.getFoundUrls();
    }

    /** Two relative links with the same file name in different directories are both found. */
    public void testRelativeLinksWithSameName() throws Exception {
        final String baseUrl = "http://www.example.com";
        final String html = "<html><head><title>Test</title></head><body><a href=branch1/index.html>link1</a>Filler, with <b>bold</b> tags and<i>others</i><a href=branch2/index.html>link2</a>";
        MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
        cachedUrl.setContent(html);
        m_extractor.extractUrls(m_mau, new StringInputStream(html), ENC, baseUrl, m_callback);

        HashSet<String> expected = new HashSet<String>();
        expected.add("http://www.example.com/branch1/index.html");
        expected.add("http://www.example.com/branch2/index.html");
        assertEquals(expected, m_callback.getFoundUrls());
    }

    /**
     * Links with leading whitespace resolve against the page directory; links with a
     * leading slash resolve against the site root.
     */
    public void testRelativeLinksWithLeadingSlash() throws Exception {
        final String baseUrl = "http://www.example.com/blah/";
        final String html = "<html><head><title>Test</title></head><body><a href= branch1/index.html>link1</a>Filler, with <b>bold</b> tags and<i>others</i><a href=\" branch2/index.html\">link2</a><a href =\" /journals/american_imago/toc/aim60.1.html\"><link rel=\"stylesheet\" href=\"/css/foo.css\" ><script type=\"text/javascript\" src=\"/javascript/bar.js\"></script>Number 1, Spring 2003</a>";
        MockCachedUrl cachedUrl = new MockCachedUrl(baseUrl);
        cachedUrl.setContent(html);
        m_extractor.extractUrls(m_mau, new StringInputStream(html), ENC, baseUrl, m_callback);

        HashSet<String> expected = new HashSet<String>();
        expected.add("http://www.example.com/blah/branch1/index.html");
        expected.add("http://www.example.com/blah/branch2/index.html");
        expected.add("http://www.example.com/journals/american_imago/toc/aim60.1.html");
        expected.add("http://www.example.com/css/foo.css");
        expected.add("http://www.example.com/javascript/bar.js");
        assertEquals(expected, m_callback.getFoundUrls());
    }

    /** Scheme-relative ("//host/path") links take the http scheme of the source URL. */
    public void testProtocolNeutralLinksHttp() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=\"//sample2.com/foo/bar.x\">link1</a>"
            + "<a href=\"//sample3.com/bar/bar.y\">link1</a>";
        m_extractor.extractUrls(m_mau, new StringInputStream(html), ENC, "http://www.example.com/blah/", m_callback);
        String[] expectedUrls = {"http://sample2.com/foo/bar.x", "http://sample3.com/bar/bar.y"};
        assertEquals(SetUtil.set(expectedUrls), m_callback.getFoundUrls());
    }

    /** Scheme-relative ("//host/path") links take the https scheme of the source URL. */
    public void testProtocolNeutralLinksHttps() throws Exception {
        final String html = "<html><head><title>Test</title></head><body>"
            + "<a href=\"//sample2.com/foo/bar.x\">link1</a>"
            + "<a href=\"//sample3.com/bar/bar.y\">link1</a>";
        m_extractor.extractUrls(m_mau, new StringInputStream(html), ENC, "https://www.example.com/blah/", m_callback);
        String[] expectedUrls = {"https://sample2.com/foo/bar.x", "https://sample3.com/bar/bar.y"};
        assertEquals(SetUtil.set(expectedUrls), m_callback.getFoundUrls());
    }

    /**
     * Renders the response header fields of {@code url} as text, one per line, in the
     * form "key : value" (or just the value when the field has no key).
     *
     * @param url location to connect to
     * @return the header lines, each terminated by the platform line separator
     * @throws IOException if the connection cannot be opened
     */
    private String getPageHeader(URL url) throws IOException {
        StringBuilder headers = new StringBuilder();
        URLConnection connection = url.openConnection();
        // Header fields are indexed from 0; the first null value marks the end.
        for (int idx = 0; ; idx++) {
            String value = connection.getHeaderField(idx);
            if (value == null) {
                break;
            }
            String key = connection.getHeaderFieldKey(idx);
            if (key != null && !key.isEmpty()) {
                headers.append(key).append(" : ");
            }
            headers.append(value).append(NEWLINE);
        }
        return headers.toString();
    }

    /**
     * Reads the whole response body of {@code url} into a string.
     *
     * <p>The body is decoded with the platform default charset, as the Scanner is
     * created without an explicit charset.
     *
     * @param url location to read
     * @return the body text, or the empty string when the response has no content
     * @throws IOException if the connection cannot be opened or reading fails
     */
    private String getPageContent(URL url) throws IOException {
        Scanner scanner = new Scanner(url.openConnection().getInputStream());
        try {
            scanner.useDelimiter(END_OF_INPUT);
            // An empty body has no token; next() would throw NoSuchElementException.
            String content = scanner.hasNext() ? scanner.next() : "";
            // Scanner swallows read errors; surface them instead of returning partial text.
            IOException readError = scanner.ioException();
            if (readError != null) {
                throw readError;
            }
            return content;
        } finally {
            // Closing the scanner also closes the underlying connection stream.
            scanner.close();
        }
    }

    /**
     * Asserts that none of the given opening tags yields a link.
     *
     * @param strArr opening tag texts to try, one at a time
     * @param str    closing tag text used with every opening tag
     */
    private void checkBadTags(String[] strArr, String str) throws Exception {
        final String link = "http://www.example.com/web_link.html";
        for (int i = 0; i < strArr.length; i++) {
            singleTagShouldNotParse(link, strArr[i], str);
        }
    }

    /**
     * Builds a minimal HTML page containing a single tag.
     *
     * @param str  attribute value (the URL), written unquoted
     * @param str2 opening tag text up to and including the "=" before the value
     * @param str3 closing tag text
     * @return the page: fixed head, then str2 + str + "&gt;" + str3, then the closing body/html tags
     */
    private String makeContent(String str, String str2, String str3) {
        final String pageStart = "<html><head><title>Test</title></head><body>";
        final String pageEnd = "</body></html>";
        return pageStart + str2 + str + ">" + str3 + pageEnd;
    }
}
