package org.lockss.util;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.commons.lang3.StringUtils;
import org.lockss.crawler.TestBaseCrawler;
import org.lockss.test.ConfigurationUtil;
import org.lockss.test.LockssTestCase;
import org.lockss.test.MockCachedUrl;
import org.lockss.test.StringInputStream;
import org.lockss.util.CharsetUtil;

/* loaded from: input_file:org/lockss/util/TestCharsetUtil.class */
public class TestCharsetUtil extends LockssTestCase {
    // Byte-order-mark (BOM) prefixes used as fixtures throughout this class.
    // Java bytes are signed, so values above 0x7F are written as negatives;
    // the unsigned hex form is given next to each constant.

    /** UTF-8 BOM: EF BB BF. */
    static final byte[] UTF8_BOM = {-17, -69, -65};
    /** UTF-16 big-endian BOM: FE FF. */
    static final byte[] UTF16_BOM_BE = {-2, -1};
    /** UTF-16 little-endian BOM: FF FE. */
    static final byte[] UTF16_BOM_LE = {-1, -2};
    /** UTF-32 big-endian BOM: 00 00 FE FF. */
    static final byte[] UTF32_BOM_BE = {0, 0, -2, -1};
    /** UTF-32 little-endian BOM: FF FE 00 00 (its first two bytes equal {@link #UTF16_BOM_LE}). */
    static final byte[] UTF32_BOM_LE = {-1, -2, 0, 0};
    /** UTF-7 BOM variant 2B 2F 76 38 (ASCII "+/v8"). */
    static final byte[] UTF7_BOM_v1 = {43, 47, 118, 56};
    /** UTF-7 BOM variant 2B 2F 76 39 (ASCII "+/v9"). */
    static final byte[] UTF7_BOM_v2 = {43, 47, 118, 57};
    /** UTF-7 BOM variant 2B 2F 76 2B (ASCII "+/v+"). */
    static final byte[] UTF7_BOM_v3 = {43, 47, 118, 43};
    /** UTF-7 BOM variant 2B 2F 76 2F (ASCII "+/v/"). */
    static final byte[] UTF7_BOM_v4 = {43, 47, 118, 47};
    /** UTF-1 BOM: F7 64 4C. */
    static final byte[] UTF1_BOM = {-9, 100, 76};
    /** Start of an HTML 4.01 document (DOCTYPE, {@code <HTML>}, opening {@code <head>}) with no charset declaration. */
    static final String HTML_FRAGMENT = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n        \"http://www.w3.org/TR/html4/loose.dtd\">\n<HTML>\n\n<head>\n";
    /** Complete HTML head section whose {@code <meta http-equiv>} tag declares {@code charset=iso-8859-1}. */
    static final String HTML_HEADER = "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\"\n        \"http://www.w3.org/TR/html4/loose.dtd\">\n<HTML>\n\n<head>\n<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">\n<title>\nTest display of HTML elements\n</title>\n</head>";
    /**
     * Full HTML document without a charset declaration. Besides character entities it
     * contains literal non-ASCII characters in the first table row; the tests below
     * encode it as UTF-8 / UTF-16 and expect those charsets to be guessed.
     * NOTE(review): the row annotated "e with circumflex" holds two characters, which
     * looks like a mis-decoded UTF-8 sequence - confirm against the original fixture.
     */
    static final String HTML_FILE = "<HTML>\n<HEAD>\n<TITLE>HTML DOCUMENT TEST</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"FFFFFF\">\n<CENTER><IMG SRC=\"clouds.jpg\" ALIGN=\"BOTTOM\"> </CENTER>\n<HR>\n<a href=\"http://somegreatsite.com\">Link Name</a>\nis a link to another nifty site\n<H1>This is a Header</H1>\n<H2>This is a Medium Header</H2>\n<h2>Character test</h2>\n<p>The following table has some sample characters with\nannotations. If the browser&#8217;s default font does not\ncontain all of them, they may get displayed using backup fonts.\nThis may cause stylistic differences, but it should not\nprevent the characters from being displayed at all.</p>\n\n<table>\n<tr><th>Char. <th>Explanation <th>Notes\n<tr><td>Ãª <td>e with circumflex <td>Latin 1 character, should be ok\n<tr><td>&#8212; <td>em dash <td>Windows Latin 1 character, should be ok, too\n<tr><td>&#x100; <td>A with macron (line above) <td>Latin Extended-A character, not present in all fonts\n<tr><td>&Omega; <td>capital omega <td>A Greek letter\n<tr><td>&#x2212; <td>minus sign <td>Unicode minus\n<tr><td>&#x2300; <td>diameter sign <td>relatively rare in fonts\n</table>\n<P> This is a new paragraph!\n<P> <B>This is a new paragraph!</B>\n<BR> <B><I>This is a new sentence without a paragraph break, in bold italics.</I></B>\n<HR>\n</BODY>\n</HTML>\n";
    /**
     * Full HTML document made only of ASCII characters and entity references, with no
     * charset declaration. The tests below encode it as ISO-8859-1 and expect
     * "ISO-8859-1" as the detected charset.
     */
    static final String HTML_FILE_NOT_UTF = "<HTML>\n<HEAD>\n<TITLE>HTML DOCUMENT TEST</TITLE>\n</HEAD>\n<BODY BGCOLOR=\"FFFFFF\">\n<CENTER><IMG SRC=\"clouds.jpg\" ALIGN=\"BOTTOM\"> </CENTER>\n<HR>\n<a href=\"http://somegreatsite.com\">Link Name</a>\nis a link to another nifty site\n<H1>This is a Header</H1>\n<H2>This is a Medium Header</H2>\n<h2>Character test</h2>\n<p>The following table has some sample characters with\nannotations. If the browser&#8217;s default font does not\ncontain all of them, they may get displayed using backup fonts.\nThis may cause stylistic differences, but it should not\nprevent the characters from being displayed at all.</p>\n\n<table>\n<tr><th>Char. <th>Explanation <th>Notes\n<tr><td>&#A2; <td> a accent grave<td> accent mark\n <tr><td>&#FD; <td>one half superscriptn<td> one-half\n<tr><td>&#E4; <td>euro <td>The Euro Sign<td> The Euro sign\n<tr><td>&#A9; <td>copyright sign <td>Copyright Sign\n<tr><td>&#BF; <td>Greek<td> Greek letter\n</table>\n<P> This is a new paragraph!\n<P> <B>This is a new paragraph!</B>\n<BR> <B><I>This is a new sentence without a paragraph break, in bold italics.</I></B>\n<HR>\n</BODY>\n</HTML>\n";
    /** Minimal HTML document with an empty head and no charset declaration; used as the payload behind BOM prefixes. */
    static final String NO_CHARSET_HTML = "<HTML>\n\n<head>\n</head>\n<body>\n<h1>My Website</h1>\n<p>Some text...</p>\n</body>\n</html>\n";
    /** URL given to the {@code MockCachedUrl} instances created in {@code testCu}. */
    static final String URL1 = "http://u.r/l";

    /* loaded from: input_file:org/lockss/util/TestCharsetUtil$NoMarkStringInputStream.class */
    /**
     * A {@link StringInputStream} that reports mark/reset as unsupported. Used by
     * {@code testGuessCharsetNoMark} to feed a non-markable stream to CharsetUtil.
     */
    static class NoMarkStringInputStream extends StringInputStream {

        /**
         * @param content text handed unchanged to the {@link StringInputStream} constructor
         */
        NoMarkStringInputStream(String content) {
            super(content);
        }

        /** Always answers {@code false}, regardless of what the superclass supports. */
        @Override
        public boolean markSupported() {
            return false;
        }
    }

    /**
     * Builds a newline-terminated {@code <meta http-equiv="Content-Type">} tag that
     * declares the given charset.
     *
     * @param str charset name to embed after {@code charset=}; a null reference is
     *            rendered as the text "null", as with string concatenation
     * @return the complete meta tag followed by a newline
     */
    String metaTag(String str) {
        StringBuilder tag = new StringBuilder("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
        tag.append(str);
        tag.append("\">\n");
        return tag.toString();
    }

    /**
     * Checks {@code CharsetUtil.guessCharsetFromBytes} against {@link #HTML_FILE}
     * encoded three ways: platform default, explicit UTF-8, and "UTF-16".
     */
    public void testGuessCharsetFromBytes() throws Exception {
        // NOTE(review): the no-arg getBytes() uses the platform default charset, so
        // this first expectation only holds where that default encodes like UTF-8.
        byte[] platformEncoded = HTML_FILE.getBytes();
        assertEquals("UTF-8", CharsetUtil.guessCharsetFromBytes(platformEncoded));

        byte[] utf8Encoded = HTML_FILE.getBytes("UTF-8");
        assertEquals("UTF-8", CharsetUtil.guessCharsetFromBytes(utf8Encoded));

        // Bytes produced by the "UTF-16" encoder are expected to be reported as UTF-16BE.
        byte[] utf16Encoded = HTML_FILE.getBytes("UTF-16");
        assertEquals("UTF-16BE", CharsetUtil.guessCharsetFromBytes(utf16Encoded));
    }

    /**
     * Checks {@code CharsetUtil.guessCharsetFromStream} on in-memory streams of
     * {@link #HTML_FILE} and {@link #HTML_FILE_NOT_UTF} in several encodings.
     */
    public void testGuessCharsetFromStream() throws Exception {
        // NOTE(review): no-arg getBytes() depends on the platform default charset.
        ByteArrayInputStream platformStream = new ByteArrayInputStream(HTML_FILE.getBytes());
        assertEquals("UTF-8", CharsetUtil.guessCharsetFromStream(platformStream));

        ByteArrayInputStream utf8Stream = new ByteArrayInputStream(HTML_FILE.getBytes("UTF-8"));
        assertEquals("UTF-8", CharsetUtil.guessCharsetFromStream(utf8Stream));

        ByteArrayInputStream utf16Stream = new ByteArrayInputStream(HTML_FILE.getBytes("UTF-16"));
        assertEquals("UTF-16BE", CharsetUtil.guessCharsetFromStream(utf16Stream));

        // Two-argument overload: the second argument is also the expected answer here.
        ByteArrayInputStream latin1Stream = new ByteArrayInputStream(HTML_FILE_NOT_UTF.getBytes("iso-8859-1"));
        assertEquals("ISO-8859-1", CharsetUtil.guessCharsetFromStream(latin1Stream, "ISO-8859-1"));
    }

    /**
     * Exercises {@code CharsetUtil.getCharsetStream} on {@link MockCachedUrl} inputs:
     * content stored without properties, content stored with an
     * {@code X-Lockss-content-type} property of "UTF-8", and content carrying a
     * UTF-16LE BOM. Each case also checks {@code getUncompressedCalled()}.
     */
    public void testCu() throws IOException {
        // Case 1: ISO-8859-1 bytes, no properties.
        MockCachedUrl plainCu = new MockCachedUrl(URL1);
        byte[] latin1Bytes = HTML_FILE_NOT_UTF.getBytes("iso-8859-1");
        plainCu.storeContent(new ByteArrayInputStream(latin1Bytes));
        assertEquals("ISO-8859-1", CharsetUtil.getCharsetStream(plainCu).getCharset());
        assertTrue(plainCu.getUncompressedCalled());

        // Case 2: same bytes, but the stored property says "UTF-8"; ISO-8859-1 is still expected.
        MockCachedUrl labeledCu = new MockCachedUrl(URL1);
        CIProperties labeledProps = new CIProperties();
        labeledProps.put("X-Lockss-content-type", "UTF-8");
        ByteArrayInputStream labeledContent = new ByteArrayInputStream(HTML_FILE_NOT_UTF.getBytes("iso-8859-1"));
        labeledCu.storeContent(labeledContent, labeledProps);
        assertEquals("ISO-8859-1", CharsetUtil.getCharsetStream(labeledCu).getCharset());
        assertTrue(labeledCu.getUncompressedCalled());

        // Case 3: UTF-16LE BOM in front of the payload; UTF-16LE is expected despite the property.
        MockCachedUrl bomCu = new MockCachedUrl(URL1);
        CIProperties bomProps = new CIProperties();
        bomProps.put("X-Lockss-content-type", "UTF-8");
        byte[] bomPrefixed = ArrayUtils.addAll(UTF16_BOM_LE, NO_CHARSET_HTML.getBytes("ISO-8859-1"));
        ByteArrayInputStream bomContent = new ByteArrayInputStream(bomPrefixed);
        bomCu.storeContent(bomContent, bomProps);
        assertEquals("UTF-16LE", CharsetUtil.getCharsetStream(bomCu).getCharset());
        assertTrue(bomCu.getUncompressedCalled());
    }

    /**
     * Table-driven check of {@code CharsetUtil.findCharsetInText}: each row pairs a
     * document prefix with the charset name expected from it ({@code null} when none
     * should be found).
     */
    public void testFindCharsetInText() throws Exception {
        String[][] cases = {
            // { document text, expected charset }
            {HTML_HEADER, "ISO-8859-1"},
            {"<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"UTF-8\">\n</head>\n<body>\n<h1>My Website</h1>\n<p>Some text...</p>\n</body>\n</html>\n", "UTF-8"},
            // A charset mention inside <TITLE> is not a declaration.
            {"<!DOCTYPE html>\n<html>\n\n<head>\n<TITLE>CHARSET=\"UTF-8\"</TITLE>\n</head>\n\n<body>\n<h1>My Website</h1>\n<p>Some text...</p>\n</body>\n\n</html>\n", null},
            // XML declaration without an encoding attribute.
            {"<?xml version=\"1.0\"?>\n", null},
            {"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n", "UTF-8"},
            {"<?xml version=\"1.0\" encoding=\"UTF-16\" standalone=\"yes\"?>", "UTF-16"},
        };
        for (int i = 0; i < cases.length; i++) {
            String expectedCharset = cases[i][1];
            byte[] raw = cases[i][0].getBytes();
            assertEquals(expectedCharset, CharsetUtil.findCharsetInText(raw, raw.length));
        }
    }

    /**
     * Reads the first 100 bytes of the UTF-8 encoded {@link #HTML_FILE} into a buffer,
     * hands that buffer plus the rest of the stream to
     * {@code CharsetUtil.joinStreamsWithCharset}, and checks that decoding the joined
     * stream yields the buffered prefix again.
     */
    public void testJoinStreamsWithCharset() throws Exception {
        byte[] head = new byte[100];
        ByteArrayInputStream remainder = new ByteArrayInputStream(HTML_FILE.getBytes("UTF-8"));
        // The fixture is longer than the buffer, so a single read must fill it;
        // asserting the count catches a short read instead of silently ignoring it.
        assertEquals(head.length, remainder.read(head, 0, head.length));

        String headText = new String(head, "UTF-8");
        // Re-encode with the charset used for decoding rather than the platform default.
        assertEquals(100, headText.getBytes("UTF-8").length);

        InputStreamReader joined = new InputStreamReader(CharsetUtil.joinStreamsWithCharset(head, remainder, "UTF-8"), "UTF-8");
        try {
            char[] decoded = new char[headText.length()];
            joined.read(decoded);
            assertEquals(headText.toCharArray(), decoded);
        } finally {
            // Release the reader even when an assertion above fails.
            joined.close();
        }
    }

    /**
     * Checks {@code CharsetUtil.supportedCharsetName}: an unknown name gives null,
     * several spellings of UTF-8 all give "UTF-8", and "UTF_32" gives "UTF-32".
     */
    public void testSupportedCharsetName() throws Exception {
        assertNull(CharsetUtil.supportedCharsetName("bogus"));

        String[] utf8Spellings = {"utf-8", "utf8", "UTF-8"};
        for (String spelling : utf8Spellings) {
            assertEquals("UTF-8", CharsetUtil.supportedCharsetName(spelling));
        }

        assertEquals("UTF-32", CharsetUtil.supportedCharsetName("UTF_32"));
    }

    /**
     * {@code CharsetUtil.isAlnum} must accept a letter and a digit and reject
     * a space and a tab.
     */
    public void testIsAlnum() throws Exception {
        byte letter = (byte) 'a';
        byte digit = (byte) '1';
        byte space = (byte) ' ';
        byte tab = (byte) '\t';

        assertTrue(CharsetUtil.isAlnum(letter));
        assertTrue(CharsetUtil.isAlnum(digit));
        assertFalse(CharsetUtil.isAlnum(space));
        assertFalse(CharsetUtil.isAlnum(tab));
    }

    /**
     * {@code CharsetUtil.isSpace} must reject a letter and a digit and accept
     * a space and a tab.
     */
    public void testIsSpace() throws Exception {
        byte letter = (byte) 'a';
        byte digit = (byte) '1';
        byte space = (byte) ' ';
        byte tab = (byte) '\t';

        assertFalse(CharsetUtil.isSpace(letter));
        assertFalse(CharsetUtil.isSpace(digit));
        assertTrue(CharsetUtil.isSpace(space));
        assertTrue(CharsetUtil.isSpace(tab));
    }

    /**
     * Both {@code guessCharsetFromStream} and {@code guessCharsetName} are expected
     * to throw {@link IllegalArgumentException} when given a stream whose
     * {@code markSupported()} returns false.
     */
    public void testGuessCharsetNoMark() throws Exception {
        NoMarkStringInputStream firstStream = new NoMarkStringInputStream("foo");
        try {
            CharsetUtil.guessCharsetFromStream(firstStream);
            fail("guessCharsetFromStream should require markSupported()");
        } catch (IllegalArgumentException expected) {
            // The exception is the outcome under test; nothing more to check.
        }

        NoMarkStringInputStream secondStream = new NoMarkStringInputStream("foo");
        try {
            CharsetUtil.guessCharsetName(secondStream);
            fail("guessCharsetName should require markSupported()");
        } catch (IllegalArgumentException expected) {
            // The exception is the outcome under test; nothing more to check.
        }
    }

    /**
     * Prepends the given bytes to the ISO-8859-1 encoding of {@link #NO_CHARSET_HTML}
     * and returns what {@code CharsetUtil.guessCharsetName} reports for that stream.
     *
     * @param bArr prefix bytes (the tests pass BOM fixtures)
     * @return the charset name reported by CharsetUtil
     * @throws IOException propagated from encoding or from CharsetUtil
     */
    String guessCharsetName(byte[] bArr) throws IOException {
        byte[] payload = NO_CHARSET_HTML.getBytes("ISO-8859-1");
        byte[] prefixed = ArrayUtils.addAll(bArr, payload);
        ByteArrayInputStream source = new ByteArrayInputStream(prefixed);
        return CharsetUtil.guessCharsetName(source);
    }

    /**
     * Checks the charset name guessed for each BOM fixture. UTF-32 and UTF-7 cases
     * run only when {@code CharsetUtil.supportedCharsetName} knows the charset;
     * for UTF-1 the expectation falls back to "UTF-8" when it is not supported.
     */
    public void testGuessCharsetName() throws Exception {
        String[] coreNames = {"UTF-8", "UTF-16LE", "UTF-16BE"};
        byte[][] coreBoms = {UTF8_BOM, UTF16_BOM_LE, UTF16_BOM_BE};
        for (int i = 0; i < coreNames.length; i++) {
            assertEquals(coreNames[i], guessCharsetName(coreBoms[i]));
        }

        if (null != CharsetUtil.supportedCharsetName("UTF-32LE")) {
            assertEquals("UTF-32LE", guessCharsetName(UTF32_BOM_LE));
            assertEquals("UTF-32BE", guessCharsetName(UTF32_BOM_BE));
        }

        if (null != CharsetUtil.supportedCharsetName("UTF-7")) {
            byte[][] utf7Boms = {UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4};
            for (byte[] utf7Bom : utf7Boms) {
                assertEquals("UTF-7", guessCharsetName(utf7Bom));
            }
        }

        String utf1Expectation = (CharsetUtil.supportedCharsetName("UTF-1") != null) ? "UTF-1" : "UTF-8";
        assertEquals(utf1Expectation, guessCharsetName(UTF1_BOM));
    }

    /**
     * Feeds {@code bArr} followed by the ISO-8859-1 bytes of {@link #NO_CHARSET_HTML}
     * to {@code CharsetUtil.getCharsetStream}, then asserts the reported charset and
     * that the returned stream, read as ISO-8859-1, equals {@link #NO_CHARSET_HTML}.
     *
     * @param str  expected charset name
     * @param bArr prefix bytes placed in front of the document
     * @throws IOException propagated from encoding, CharsetUtil or reading
     */
    void assertStreamCharsetFromBOM(String str, byte[] bArr) throws IOException {
        byte[] document = NO_CHARSET_HTML.getBytes("ISO-8859-1");
        ByteArrayInputStream source = new ByteArrayInputStream(ArrayUtils.addAll(bArr, document));
        CharsetUtil.InputStreamAndCharset result = CharsetUtil.getCharsetStream(source);

        assertEquals(str, result.getCharset());

        InputStreamReader bodyReader = new InputStreamReader(result.getInStream(), "ISO-8859-1");
        assertReaderMatchesString(NO_CHARSET_HTML, bodyReader);
    }

    /**
     * Asserts the charset that {@code CharsetUtil.getCharsetStream} reports for the
     * ISO-8859-1 encoding of the given text.
     *
     * @param str  expected charset name
     * @param str2 document text to encode and inspect
     * @throws IOException propagated from encoding or from CharsetUtil
     */
    void assertStreamCharset(String str, String str2) throws IOException {
        byte[] encoded = str2.getBytes("ISO-8859-1");
        ByteArrayInputStream source = new ByteArrayInputStream(encoded);
        assertEquals(str, CharsetUtil.getCharsetStream(source).getCharset());
    }

    /**
     * Runs {@code assertStreamCharsetFromBOM} for every BOM fixture. UTF-32 and UTF-7
     * cases run only when {@code CharsetUtil.supportedCharsetName} knows the charset;
     * for UTF-1 the expectation falls back to "UTF-8" when it is not supported.
     */
    public void testGetCharsetStream() throws Exception {
        String[] coreNames = {"UTF-8", "UTF-16LE", "UTF-16BE"};
        byte[][] coreBoms = {UTF8_BOM, UTF16_BOM_LE, UTF16_BOM_BE};
        for (int i = 0; i < coreNames.length; i++) {
            assertStreamCharsetFromBOM(coreNames[i], coreBoms[i]);
        }

        if (null != CharsetUtil.supportedCharsetName("UTF-32LE")) {
            assertStreamCharsetFromBOM("UTF-32LE", UTF32_BOM_LE);
            assertStreamCharsetFromBOM("UTF-32BE", UTF32_BOM_BE);
        }

        if (null != CharsetUtil.supportedCharsetName("UTF-7")) {
            byte[][] utf7Boms = {UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4};
            for (byte[] utf7Bom : utf7Boms) {
                assertStreamCharsetFromBOM("UTF-7", utf7Bom);
            }
        }

        String utf1Expectation = (CharsetUtil.supportedCharsetName("UTF-1") != null) ? "UTF-1" : "UTF-8";
        assertStreamCharsetFromBOM(utf1Expectation, UTF1_BOM);
    }

    /**
     * Checks how the {@code org.lockss.crawler.inferCharsetBufSize} setting changes
     * the charset reported for documents whose declaration sits at different offsets.
     */
    public void testBufSize() throws Exception {
        final String bufSizeKey = "org.lockss.crawler.inferCharsetBufSize";

        // With the configuration untouched the declared charset is reported.
        assertStreamCharset("ISO-8859-1", HTML_HEADER);

        // With the setting at "4" the same document is expected to report ISO-8859-2.
        // NOTE(review): presumably a fallback when the declaration is out of reach - confirm in CharsetUtil.
        ConfigurationUtil.setFromArgs(bufSizeKey, "4");
        assertStreamCharset("ISO-8859-2", HTML_HEADER);

        // Push the meta tag far into the document with 2000 repetitions of padding.
        StringBuilder padded = new StringBuilder(HTML_FRAGMENT);
        padded.append(StringUtils.repeat("          ", 2000));
        padded.append(metaTag("UTF-8"));
        String distantDeclaration = padded.toString();
        assertStreamCharset("ISO-8859-2", distantDeclaration);

        // NOTE(review): EMPTY_PAGE comes from an unrelated test class; it appears to
        // stand in for an empty value that resets the setting - confirm.
        ConfigurationUtil.setFromArgs(bufSizeKey, TestBaseCrawler.EMPTY_PAGE);
        assertStreamCharset("UTF-8", distantDeclaration);
    }

    /**
     * {@code CharsetUtil.hasUtf8BOM} must be true for {@link #UTF8_BOM} and false for
     * every other BOM fixture.
     */
    public void testHasUtf8BOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf8BOM(bom, bom.length);
            // Identity comparison picks out the one fixture that should match.
            if (bom == UTF8_BOM) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf16BEBOM} must be true for {@link #UTF16_BOM_BE} and
     * false for every other BOM fixture.
     */
    public void testHasUtf16BEBOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf16BEBOM(bom, bom.length);
            // Identity comparison picks out the one fixture that should match.
            if (bom == UTF16_BOM_BE) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf16LEBOM} must be true for {@link #UTF16_BOM_LE} and
     * also for {@link #UTF32_BOM_LE}, whose first two bytes are the same FF FE
     * sequence; it must be false for every other BOM fixture.
     */
    public void testHasUtf16LEBOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf16LEBOM(bom, bom.length);
            boolean startsWithUtf16Le = (bom == UTF16_BOM_LE) || (bom == UTF32_BOM_LE);
            if (startsWithUtf16Le) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf32BEBOM} must be true for {@link #UTF32_BOM_BE} and
     * false for every other BOM fixture.
     */
    public void testHasUtf32BEBOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf32BEBOM(bom, bom.length);
            // Identity comparison picks out the one fixture that should match.
            if (bom == UTF32_BOM_BE) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf32LEBOM} must be true for {@link #UTF32_BOM_LE} and
     * false for every other BOM fixture, including the shorter UTF-16LE one.
     */
    public void testHasUtf32LEBOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf32LEBOM(bom, bom.length);
            // Identity comparison picks out the one fixture that should match.
            if (bom == UTF32_BOM_LE) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf7BOM} must be true for all four UTF-7 BOM variants and
     * false for every other BOM fixture.
     */
    public void testHasUtf7BOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf7BOM(bom, bom.length);
            boolean isUtf7Variant = (bom == UTF7_BOM_v1) || (bom == UTF7_BOM_v2)
                    || (bom == UTF7_BOM_v3) || (bom == UTF7_BOM_v4);
            if (isUtf7Variant) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * {@code CharsetUtil.hasUtf1BOM} must be true for {@link #UTF1_BOM} and false for
     * every other BOM fixture.
     */
    public void testHasUtf1BOM() throws Exception {
        byte[][] candidates = {
            UTF8_BOM, UTF16_BOM_BE, UTF16_BOM_LE, UTF32_BOM_BE, UTF32_BOM_LE,
            UTF7_BOM_v1, UTF7_BOM_v2, UTF7_BOM_v3, UTF7_BOM_v4, UTF1_BOM
        };
        for (byte[] bom : candidates) {
            boolean detected = CharsetUtil.hasUtf1BOM(bom, bom.length);
            // Identity comparison picks out the one fixture that should match.
            if (bom == UTF1_BOM) {
                assertTrue(detected);
            } else {
                assertFalse(detected);
            }
        }
    }

    /**
     * A zero-length input read with "UTF-8" is expected to decode to
     * {@code TestBaseCrawler.EMPTY_PAGE}.
     * NOTE(review): that constant is borrowed from an unrelated test class and
     * presumably equals the empty string - confirm.
     */
    public final void testEmptyDocument() throws IOException {
        byte[] noBytes = new byte[0];
        String expectedText = TestBaseCrawler.EMPTY_PAGE;
        assertCharset(expectedText, noBytes, "UTF-8");
    }

    /**
     * For each document/charset pair, encodes the document in that charset and checks
     * via {@code assertCharset} that it reads back intact under that charset. The
     * documents vary the spacing and quoting around {@code charset=} in the meta tag.
     */
    public final void testMetaHttpEquiv() throws IOException {
        String[][] cases = {
            // { document, charset }
            {"<html><head><meta http-equiv=\"Content-type\" value=\"text/html;charset=UTF-8\"></head><body>Hello, World!</body></html>", "UTF-8"},
            {"<html><head><meta http-equiv=\"Content-type\" value=\"text/html;charset =UTF-16BE\"></head><body>Hello, World!</body></html>", "UTF-16BE"},
            {"<html><head><meta http-equiv=\"Content-type\" value=\"text/html;charset= 'UTF-16LE\"></head><body>Hello, World!</body></html>", "UTF-16LE"},
        };
        for (String[] testCase : cases) {
            String document = testCase[0];
            String charsetName = testCase[1];
            assertCharset(document, document.getBytes(charsetName), charsetName);
        }
    }

    /**
     * Encodes a document that starts with U+FEFF in several charsets and checks via
     * {@code assertCharset} that it reads back without the leading U+FEFF. UTF-32,
     * UTF-7 and UTF-1 are exercised only when CharsetUtil reports them as supported.
     */
    public final void testBOM() throws IOException {
        String withBom = "\ufeff<html>Hello, World!</html>";
        String expectedText = "<html>Hello, World!</html>";

        String[] alwaysTested = {"UTF-8", "UTF-16LE", "UTF-16BE"};
        for (String charsetName : alwaysTested) {
            assertCharset(expectedText, withBom.getBytes(charsetName), charsetName);
        }

        if (null != CharsetUtil.supportedCharsetName("UTF-32LE")) {
            String[] utf32Names = {"UTF-32LE", "UTF-32BE"};
            for (String charsetName : utf32Names) {
                assertCharset(expectedText, withBom.getBytes(charsetName), charsetName);
            }
        }

        String[] optionalNames = {"UTF-7", "UTF-1"};
        for (String charsetName : optionalNames) {
            if (null != CharsetUtil.supportedCharsetName(charsetName)) {
                assertCharset(expectedText, withBom.getBytes(charsetName), charsetName);
            }
        }
    }

    /**
     * Checks documents whose charset is named in an XML declaration, including one
     * run with {@code org.lockss.crawler.inferCharsetBufSize} set to "10".
     */
    public final void testCharsetInXmlHeader() throws IOException {
        final String bufSizeKey = "org.lockss.crawler.inferCharsetBufSize";

        String utf8Doc = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><html>Hello, World!</html>";
        byte[] utf8Bytes = utf8Doc.getBytes("UTF-8");
        assertCharset(utf8Doc, utf8Bytes, "UTF-8");

        // With the setting at "10" and "ISO-8859-1" passed in, ISO-8859-1 is expected.
        ConfigurationUtil.setFromArgs(bufSizeKey, "10");
        assertCharset(utf8Doc, utf8Doc.getBytes("UTF-8"), "ISO-8859-1");

        // NOTE(review): EMPTY_PAGE comes from an unrelated test class; it appears to
        // stand in for an empty value that resets the setting - confirm.
        ConfigurationUtil.setFromArgs(bufSizeKey, TestBaseCrawler.EMPTY_PAGE);

        String utf16BeDoc = "<?xml version=\"1.0\" encoding=\"UTF-16BE\"?><html>Hello, World!</html>";
        assertCharset(utf16BeDoc, utf16BeDoc.getBytes("UTF-16BE"), "UTF-16BE");

        String utf16LeDoc = "<?xml version=\"1.0\" encoding=\"UTF-16LE\"?><html>Hello, World!</html>";
        assertCharset(utf16LeDoc, utf16LeDoc.getBytes("UTF-16LE"), "UTF-16LE");
    }

    /**
     * Writes the two full-document fixtures to temp files in a known encoding and
     * checks that {@code CharsetUtil.guessCharsetName} reports that encoding when
     * reading the file back, although neither document declares a charset.
     */
    public final void testCharsetNotInHeader() throws IOException {
        BufferedInputStream latin1In = new BufferedInputStream(new FileInputStream(writeEncodedFile(HTML_FILE_NOT_UTF, "ISO-8859-1")));
        try {
            assertEquals("ISO-8859-1", CharsetUtil.guessCharsetName(latin1In));
        } finally {
            // Close the file handle even when the assertion fails.
            latin1In.close();
        }

        BufferedInputStream utf8In = new BufferedInputStream(new FileInputStream(writeEncodedFile(HTML_FILE, "UTF-8")));
        try {
            assertEquals("UTF-8", CharsetUtil.guessCharsetName(utf8In));
        } finally {
            utf8In.close();
        }
    }

    /**
     * A "charset=UTF-16LE" mention inside the document title must not override the
     * actual encoding: the BOM-prefixed document is encoded in each listed charset
     * and {@code assertCharset} is run with that same charset.
     */
    public final void testCharsetInText() throws IOException {
        String expectedText = "<html><head><title>charset=UTF-16LE</title></head><body>Hello, World!</body></html>";
        String withBom = "\ufeff<html><head><title>charset=UTF-16LE</title></head><body>Hello, World!</body></html>";
        String[] charsetNames = {"UTF-8", "UTF-16LE", "UTF-16BE"};
        for (int i = 0; i < charsetNames.length; i++) {
            String charsetName = charsetNames[i];
            assertCharset(expectedText, withBom.getBytes(charsetName), charsetName);
        }
    }

    /**
     * Opens a reader over the given bytes with {@code CharsetUtil.getReader} and
     * asserts both the reader's encoding and the text it produces.
     *
     * @param str  text the reader is expected to yield
     * @param bArr encoded input bytes
     * @param str2 charset name passed to {@code CharsetUtil.getReader}; the reader's
     *             encoding must resolve to a charset with this display name
     * @throws IOException propagated from CharsetUtil or from reading
     */
    private static void assertCharset(String str, byte[] bArr, String str2) throws IOException {
        InputStreamReader reader = CharsetUtil.getReader(new ByteArrayInputStream(bArr), str2);
        // getEncoding() may return a historical alias, so normalise it through Charset.
        assertEquals(str2, Charset.forName(reader.getEncoding()).displayName());
        assertEquals(str, StringUtil.fromReader(reader));
    }

    /**
     * Writes the given text to a temp file using the named charset.
     *
     * @param str  text to write
     * @param str2 charset name; also passed to {@code getTempFile} as its second argument
     * @return the file that was written
     * @throws IOException if the file cannot be opened or written
     */
    private File writeEncodedFile(String str, String str2) throws IOException {
        File tempFile = getTempFile("charset", str2);
        // Resolve the charset before opening the file so a bad name cannot leak a stream.
        Charset charset = Charset.forName(str2);
        OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(tempFile), charset.newEncoder());
        try {
            writer.write(str);
            writer.flush();
        } finally {
            // Always release the file handle, even when the write fails.
            writer.close();
        }
        return tempFile;
    }
}
