package it.unimi.dsi.law.warc.tool;

import com.martiansoftware.jsap.FlaggedOption;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.Switch;
import com.martiansoftware.jsap.UnflaggedOption;
import it.unimi.dsi.big.util.StringMap;
import it.unimi.dsi.fastutil.ints.IntArrays;
import it.unimi.dsi.fastutil.ints.IntOpenHashSet;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.io.FastBufferedInputStream;
import it.unimi.dsi.fastutil.io.FastBufferedOutputStream;
import it.unimi.dsi.law.warc.filters.Filter;
import it.unimi.dsi.law.warc.filters.parser.FilterParser;
import it.unimi.dsi.law.warc.io.GZWarcRecord;
import it.unimi.dsi.law.warc.io.HttpResponseFilteredIterator;
import it.unimi.dsi.law.warc.io.WarcRecord;
import it.unimi.dsi.law.warc.parser.HTMLParser;
import it.unimi.dsi.law.warc.util.HttpResponse;
import it.unimi.dsi.law.warc.util.WarcHttpResponse;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.URI;
import java.util.Iterator;
import java.util.concurrent.TimeUnit;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:it/unimi/dsi/law/warc/tool/ExtractLinks.class */
/**
 * Command-line tool that extracts the links contained in the HTTP responses of a WARC file.
 *
 * <p>Two output formats are produced by {@link #run}, depending on whether a URL map is supplied:
 * <ul>
 * <li>without a URL map, each line contains the URL of a page followed by the URLs of the links
 * found in the page, separated by tabs;
 * <li>with a URL map, each line contains the index of a page followed by the sorted,
 * duplicate-free indices of the linked pages; every index is followed by a tab.
 * </ul>
 */
public class ExtractLinks {
    private static final Logger LOGGER = LoggerFactory.getLogger(ExtractLinks.class);

    /** The default size of the I/O buffers, as parsed by {@code JSAP.INTSIZE_PARSER}. */
    public static final String DEFAULT_BUFFER_SIZE = "64Ki";

    /**
     * Scans the responses of a WARC stream accepted by a filter and writes their links.
     *
     * @param in the stream the WARC records are read from.
     * @param isGZipped whether records must be read as {@link GZWarcRecord}s rather than plain {@link WarcRecord}s.
     * @param filter the filter selecting the responses to process.
     * @param pw the writer receiving the output, one line per processed response.
     * @param urls a map from URLs to indices, or {@code null} to write URLs instead of indices.
     * @param duplicates a map consulted for links that are not in {@code urls}, or {@code null};
     *        it is used only when {@code urls} is not {@code null}.
     * @throws IOException if the underlying iterator or parser fails.
     */
    public static void run(FastBufferedInputStream in, boolean isGZipped, Filter<HttpResponse> filter, PrintWriter pw, StringMap<? extends CharSequence> urls, StringMap<? extends CharSequence> duplicates) throws IOException {
        WarcRecord record = isGZipped ? new GZWarcRecord() : new WarcRecord();
        WarcHttpResponse response = new WarcHttpResponse();
        HttpResponseFilteredIterator responses = new HttpResponseFilteredIterator(in, record, response, filter);
        HTMLParser parser = new HTMLParser();
        HTMLParser.SetLinkReceiver linkReceiver = new HTMLParser.SetLinkReceiver();
        IntOpenHashSet successors = new IntOpenHashSet();
        // Scratch array reused across pages; it is only ever grown.
        int[] successor = IntArrays.EMPTY_ARRAY;
        ProgressLogger pl = new ProgressLogger(LOGGER, 1L, TimeUnit.MINUTES, "pages");
        pl.start("Extracting...");

        while (responses.hasNext()) {
            // The iterator fills the shared response object; its return value is not needed.
            responses.next();

            if (urls == null) {
                // URL mode: the page URL followed by the tab-separated URLs of its links.
                pw.print(response.uri());
                parser.parse(response, linkReceiver);
                Iterator<URI> links = linkReceiver.iterator();
                while (links.hasNext()) {
                    pw.print('\t');
                    pw.print(links.next());
                }
            } else {
                int index = (int) urls.getLong(response.uri().toString());
                if (response.isDuplicate()) {
                    if (index >= 0) {
                        LOGGER.error("URL {} is contained in the URL map but it is a duplicate", response.uri());
                        // NOTE(review): this branch terminates its own line and updates the progress
                        // logger, and then the common println()/update() at the end of the loop body
                        // run as well (an extra empty line and a double count). Behavior is preserved
                        // as found; confirm whether it is intended.
                        pw.println(index);
                        pl.update();
                    }
                } else if (index == -1) {
                    LOGGER.error("URL {} is not contained in the URL map; this may happen if the original digest/URL file was sorted unstably or if there are several non-duplicate pages with the same digest", response.uri());
                } else {
                    pw.print(index);
                    pw.print('\t');
                    parser.parse(response, linkReceiver);
                    successors.clear();
                    collectSuccessors(linkReceiver.iterator(), urls, duplicates, successors);

                    // Dump the set into the scratch array and sort the used prefix only.
                    int size = successors.size();
                    successor = IntArrays.grow(successor, size, 0);
                    successors.toArray(successor);
                    IntArrays.quickSort(successor, 0, size);
                    for (int k = 0; k < size; k++) {
                        pw.print(successor[k]);
                        pw.print('\t');
                    }
                }
            }

            pw.println();
            pl.update();
        }
        pl.done();
    }

    /**
     * Adds to a set the indices of the given links, looking them up first in the URL map and
     * then, if available, in the duplicate map; links found in neither map are skipped.
     */
    private static void collectSuccessors(Iterator<URI> links, StringMap<? extends CharSequence> urls, StringMap<? extends CharSequence> duplicates, IntOpenHashSet successors) {
        while (links.hasNext()) {
            URI link = links.next();
            String linkString = link.toString();
            int index = (int) urls.getLong(linkString);
            if (index != -1) {
                // Parameterized logging avoids building the message when debug is disabled.
                LOGGER.debug("Adding successor {}:{}", link, index);
                successors.add(index);
            } else if (duplicates != null) {
                int duplicateIndex = (int) duplicates.getLong(linkString);
                if (duplicateIndex != -1) {
                    LOGGER.debug("Adding duplicate {}:{}", link, duplicateIndex);
                    successors.add(duplicateIndex);
                }
            }
        }
    }

    /**
     * Loads the serialized map named by a command-line option.
     *
     * @return the deserialized map, or {@code null} if the option was not specified by the user.
     */
    @SuppressWarnings("unchecked")
    private static StringMap<? extends CharSequence> loadMap(JSAPResult jsapResult, String id) throws IOException, ClassNotFoundException {
        if (!jsapResult.userSpecified(id)) {
            return null;
        }
        // The file is trusted to contain a StringMap; the element type cannot be checked at runtime.
        return (StringMap<? extends CharSequence>) BinIO.loadObject(jsapResult.getString(id));
    }

    /**
     * Parses the command line, opens the WARC input (a file or standard input) and writes the
     * extracted links to standard output.
     *
     * @param arg the command-line arguments.
     * @throws Exception if argument handling, map loading, filter parsing or I/O fails.
     */
    public static void main(String[] arg) throws Exception {
        SimpleJSAP jsap = new SimpleJSAP(ExtractLinks.class.getName(), "Extract links in pages from a WARC file.", new Parameter[] {
            new FlaggedOption("bufferSize", JSAP.INTSIZE_PARSER, DEFAULT_BUFFER_SIZE, JSAP.NOT_REQUIRED, 'b', "buffer-size", "The size of an I/O buffer."),
            new Switch("gzip", 'z', "gzip", "Tells if the warc is compressed."),
            new FlaggedOption("filter", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'f', "filter", "The filter."),
            new FlaggedOption("start", JSAP.LONG_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 's', "start", "The starting offset (in bytes) in the WARC file (mainly for debugging purposes)."),
            new FlaggedOption("duplicates", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'd', "duplicates", "The (remapped) term map for duplicate URLs. If not present, only links pointing to URLs in <urls> will be used."),
            // The long flag must be given explicitly: with one argument less, the help text
            // would be taken as the long flag and the option would have no help.
            new FlaggedOption("urls", JSAP.STRING_PARSER, JSAP.NO_DEFAULT, JSAP.NOT_REQUIRED, 'u', "urls", "The term map for the node URLs."),
            new UnflaggedOption("warcFile", JSAP.STRING_PARSER, "-", JSAP.REQUIRED, JSAP.NOT_GREEDY, "The WARC file basename (if not present, or -, stdin will be used).")
        });
        JSAPResult jsapResult = jsap.parse(arg);
        if (jsap.messagePrinted()) {
            System.exit(1);
        }

        boolean isGZipped = jsapResult.getBoolean("gzip");
        String filterSpec = jsapResult.getString("filter");
        if (filterSpec == null) {
            filterSpec = "TRUE";
        }
        String warcFile = jsapResult.getString("warcFile");
        int bufferSize = jsapResult.getInt("bufferSize");

        @SuppressWarnings({"unchecked", "rawtypes"})
        Filter<HttpResponse> filter = (Filter<HttpResponse>) new FilterParser(HttpResponse.class).parse(filterSpec);
        StringMap<? extends CharSequence> urls = loadMap(jsapResult, "urls");
        StringMap<? extends CharSequence> duplicates = loadMap(jsapResult, "duplicates");

        InputStream rawInput;
        if (warcFile.equals("-")) {
            rawInput = System.in;
        } else {
            rawInput = new FileInputStream(new File(warcFile + ".warc" + (isGZipped ? ".gz" : "")));
        }

        FastBufferedInputStream in = new FastBufferedInputStream(rawInput, bufferSize);
        try {
            if (jsapResult.userSpecified("start")) {
                in.skip(jsapResult.getLong("start"));
            }
            PrintWriter pw = new PrintWriter(new OutputStreamWriter((OutputStream) new FastBufferedOutputStream(System.out, bufferSize), "ASCII"));
            try {
                run(in, isGZipped, filter, pw, urls, duplicates);
            } finally {
                // Closing flushes the buffered output even when run() fails.
                pw.close();
            }
        } finally {
            in.close();
        }
    }
}
