package it.unimi.dsi.law.scratch;

import com.google.common.base.Charsets;
import com.martiansoftware.jsap.JSAP;
import com.martiansoftware.jsap.JSAPException;
import com.martiansoftware.jsap.JSAPResult;
import com.martiansoftware.jsap.Parameter;
import com.martiansoftware.jsap.SimpleJSAP;
import com.martiansoftware.jsap.UnflaggedOption;
import info.bliki.wiki.filter.Encoder;
import it.unimi.di.big.mg4j.tool.URLMPHVirtualDocumentResolver;
import it.unimi.dsi.fastutil.io.BinIO;
import it.unimi.dsi.fastutil.longs.Long2LongOpenHashMap;
import it.unimi.dsi.logging.ProgressLogger;
import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URLDecoder;
import java.util.List;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:it/unimi/dsi/law/scratch/PageviewsExtractor.class */
public class PageviewsExtractor {
    private static final int MISSING = -1;
    private static final int SHOULD_NOT_BE_COUNTED = -2;
    private static final boolean UPPER_FIRST_CHAR = true;
    private static final Logger LOGGER = LoggerFactory.getLogger(PageviewsExtractor.class);
    private static URLMPHVirtualDocumentResolver vdr;

    private static long keyOfPagename(String str) {
        return keyOfPagename(str, 0);
    }

    private static long keyOfPagename(String str, int i) {
        String strip;
        int i2 = i + 1;
        if (i == 3) {
            return -1L;
        }
        String strip2 = StringUtils.strip(str, "_ ");
        if (!isAnArticle(str)) {
            return -2L;
        }
        long resolve = vdr.resolve("http://en.wikipedia.org/wiki/" + str);
        if (resolve >= 0) {
            return resolve;
        }
        try {
            strip = StringUtils.strip(Encoder.encodeTitleToUrl(strip2, true), "_ ");
        } catch (IllegalArgumentException e) {
        }
        if (!isAnArticle(strip)) {
            return -2L;
        }
        long resolve2 = vdr.resolve("http://en.wikipedia.org/wiki/" + strip);
        if (resolve2 >= 0) {
            return resolve2;
        }
        long keyOfPagename = keyOfPagename(strip, i2);
        if (keyOfPagename >= 0) {
            return keyOfPagename;
        }
        try {
            String decode = URLDecoder.decode(strip2, "UTF-8");
            if (!isAnArticle(decode)) {
                return -2L;
            }
            long resolve3 = vdr.resolve("http://en.wikipedia.org/wiki/" + decode);
            if (resolve3 >= 0) {
                return resolve3;
            }
            long keyOfPagename2 = keyOfPagename(decode, i2);
            if (keyOfPagename2 >= 0) {
                return keyOfPagename2;
            }
            return -1L;
        } catch (UnsupportedEncodingException | IllegalArgumentException e2) {
            return -1L;
        }
    }

    private static long keyOfTitle(String str) {
        return vdr.resolve("http://en.wikipedia.org/wiki/" + Encoder.encodeTitleToUrl(str, true));
    }

    private static boolean isAnArticle(String str) {
        return (str.indexOf(58) != -1 || str.startsWith("#") || str.startsWith("//")) ? false : true;
    }

    public static void main(String[] strArr) throws IOException, JSAPException, ClassNotFoundException {
        SimpleJSAP simpleJSAP = new SimpleJSAP(PageviewsExtractor.class.getName(), "Read lines from StdIn in the form <page name> <page view> and output a serialized rank file with ranks for each enwiki title, ordered according to the provided enwiki.titles file. ", new Parameter[]{new UnflaggedOption("enwiki.titles", JSAP.STRING_PARSER, true, "A file containing titles from enwiki, one per line."), new UnflaggedOption("enwikired.vdr", JSAP.STRING_PARSER, true, "The Virtual Document Resolver of en.wiki that resolves redirects."), new UnflaggedOption("missing-pagenames", JSAP.STRING_PARSER, true, "File that will be saved with all pagenames that were not found in titles."), new UnflaggedOption("rank", JSAP.STRING_PARSER, true, "Output rank file.")});
        JSAPResult parse = simpleJSAP.parse(strArr);
        if (simpleJSAP.messagePrinted()) {
            System.exit(1);
        }
        LOGGER.info("Loading redirects...");
        vdr = (URLMPHVirtualDocumentResolver) BinIO.loadObject(parse.getString("enwikired.vdr"));
        LOGGER.info("Reading titles from the input file...");
        List<String> readLines = IOUtils.readLines(new FileInputStream(parse.getString("enwiki.titles")), Charsets.UTF_8);
        LOGGER.info("Checking existing titles...");
        for (String str : readLines) {
            if (keyOfTitle(str) == -1) {
                throw new IllegalStateException("title file and virtual document resolver do not match: \"" + str + "\" do not resolve to anything.");
            }
        }
        Long2LongOpenHashMap long2LongOpenHashMap = new Long2LongOpenHashMap();
        BufferedReader bufferedReader = new BufferedReader(new InputStreamReader(System.in));
        ProgressLogger progressLogger = new ProgressLogger(LOGGER);
        FileWriter fileWriter = new FileWriter(parse.getString("missing-pagenames"));
        int i = 0;
        int i2 = 0;
        progressLogger.start("Parsing page counts from stdIn...");
        while (true) {
            String readLine = bufferedReader.readLine();
            if (readLine == null) {
                break;
            }
            String[] split = readLine.split(" ", 2);
            long keyOfPagename = keyOfPagename(split[0]);
            if (keyOfPagename >= 0) {
                long2LongOpenHashMap.addTo(keyOfPagename, Long.parseLong(split[1]));
                i2++;
            } else if (keyOfPagename == -1) {
                i++;
                fileWriter.write(readLine + "\n");
            }
            progressLogger.lightUpdate();
        }
        progressLogger.done();
        fileWriter.close();
        Logger logger = LOGGER;
        parse.getString("missing-pagenames");
        logger.info(i + " valid pagenames (" + ((i / i2) * 100.0d) + "%) not resolved to an existing title and written to " + logger + ".");
        LOGGER.info("Producing array of ranks...");
        int i3 = 0;
        double[] dArr = new double[readLines.size()];
        int size = readLines.size();
        while (true) {
            int i4 = size;
            size--;
            if (i4 == 0) {
                LOGGER.info(i3 + " ranked pages (" + ((i3 / readLines.size()) * 100.0d) + "%) had zero pagecount.");
                LOGGER.info("Saving scores to file " + parse.getString("rank") + "...");
                BinIO.storeDoubles(dArr, parse.getString("rank"));
                LOGGER.info("Done.");
                return;
            }
            dArr[size] = long2LongOpenHashMap.get(keyOfTitle((String) readLines.get(size)));
            if (dArr[size] == 0.0d) {
                i3++;
            }
        }
    }
}
