package org.grobid.core.engines;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.StringTokenizer;
import org.apache.commons.io.FileUtils;
import org.apache.xml.serialize.LineSeparator;
import org.grobid.core.GrobidModels;
import org.grobid.core.document.Document;
import org.grobid.core.document.DocumentSource;
import org.grobid.core.engines.tagging.GenericTaggerUtils;
import org.grobid.core.exceptions.GrobidException;
import org.grobid.core.features.FeatureFactory;
import org.grobid.core.features.FeaturesVectorSegmentation;
import org.grobid.core.layout.Block;
import org.grobid.core.layout.LayoutToken;
import org.grobid.core.utilities.LanguageUtilities;
import org.grobid.core.utilities.TextUtilities;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/grobid-core-0.3.4.jar:org/grobid/core/engines/Segmentation.class */
public class Segmentation extends AbstractParser {
    /** Logger for this class. */
    private static final Logger LOGGER = LoggerFactory.getLogger(Segmentation.class);
    /** Language utilities singleton, assigned once in the constructor. */
    private LanguageUtilities languageUtilities;
    /**
     * Value 12.
     * NOTE(review): presumably the number of bins used when discretising the relative document/page
     * positions in {@link #getAllLinesFeatured(Document)} - confirm.
     */
    private static final int NBBINS = 12;
    /**
     * Value 10.
     * NOTE(review): presumably the scale used when discretising a line's length relative to the
     * longest line of its block in {@link #getAllLinesFeatured(Document)} - confirm.
     */
    private static final int LINESCALE = 10;

    /**
     * Creates a parser bound to {@link GrobidModels#SEGMENTATION} and caches the
     * {@link LanguageUtilities} singleton.
     */
    public Segmentation() {
        super(GrobidModels.SEGMENTATION);
        this.languageUtilities = LanguageUtilities.getInstance();
    }

    /**
     * Convenience overload: delegates to {@link #processing(String, String)} with {@code null} as
     * the second argument.
     *
     * @param str forwarded unchanged as the first argument
     * @return whatever the delegated call returns
     */
    public Document processing(String str) {
        return processing(str, null);
    }

    /**
     * Convenience overload: delegates to {@link #processing(String, String, int, int)} with
     * {@code -1} for both integer arguments.
     *
     * @param str  forwarded unchanged as the first argument
     * @param str2 forwarded unchanged as the second argument
     * @return whatever the delegated call returns
     */
    public Document processing(String str, String str2) {
        return processing(str, str2, -1, -1);
    }

    /* JADX WARN: Removed duplicated region for block: B:74:0x025c  */
    /* JADX WARN: Removed duplicated region for block: B:77:0x0265  */
    /*
        Code decompiled incorrectly, please refer to instructions dump.
        To view partially-correct add '--show-bad-code' argument
    */
    /**
     * Main segmentation entry point, which the two shorter {@code processing} overloads delegate to.
     * <p>
     * NOTE(review): the decompiler could not recover this method's body. As written here it
     * unconditionally throws {@link UnsupportedOperationException}, and so do both overloads that
     * call it. The real implementation must be restored from the original library source before
     * this class can be used for processing.
     * <p>
     * NOTE(review): parameter meanings are not recoverable from this file. The overloads pass
     * {@code null} for {@code r8} and {@code -1} for {@code r9}/{@code r10}; presumably the two
     * ints are a page range - confirm against the original source.
     *
     * @param r7  first argument, forwarded from the overloads
     * @param r8  second argument; {@code null} when called through {@code processing(String)}
     * @param r9  first integer argument; {@code -1} when called through the overloads
     * @param r10 second integer argument; {@code -1} when called through the overloads
     * @return never returns normally in this decompiled form
     * @throws UnsupportedOperationException always, in this decompiled form
     */
    public org.grobid.core.document.Document processing(java.lang.String r7, java.lang.String r8, int r9, int r10) {
        /*
            Method dump skipped, instructions count: 622
            To view this dump add '--comments-level debug' option
        */
        throw new UnsupportedOperationException("Method not decompiled: org.grobid.core.engines.Segmentation.processing(java.lang.String, java.lang.String, int, int):org.grobid.core.document.Document");
    }

    /**
     * Builds the feature rows of the segmentation model, one row per retained text line.
     * <p>
     * Each block's text is split into lines. A line is retained when its block has at least one
     * layout token, its first whitespace-separated word is non-blank and
     * {@link TextUtilities#filterLine(String)} does not reject it. For a retained line a
     * {@link FeaturesVectorSegmentation} is filled (lexical tests on the first word, font changes
     * taken from the block's first token, block/page status, binned relative positions) and its
     * {@code printVector()} output is appended to the result.
     * <p>
     * A vector is printed only when the next one has been built (or at the very end), because
     * starting a new page retroactively sets the previous vector's page status to {@code PAGEEND}.
     * <p>
     * Blocks whose text is {@code null} contribute no lines (previously this threw a
     * {@link NullPointerException}); blocks with an empty token list are ignored when measuring
     * the page (previously this threw an {@link IndexOutOfBoundsException}).
     *
     * @param document the document whose blocks are turned into feature rows
     * @return the concatenated printed vectors, or {@code null} when the document has no blocks
     */
    public static String getAllLinesFeatured(Document document) {
        FeatureFactory featureFactory = FeatureFactory.getInstance();
        StringBuilder featureRows = new StringBuilder();

        List<Block> blocks = document.getBlocks();
        if (blocks == null || blocks.isEmpty()) {
            return null;
        }

        // NOTE(review): the result was never used by the original code; the call is retained only
        // in case the accessor has side effects - confirm and remove.
        document.getTokenizations();

        // Font state carried from one retained line to the next.
        String currentFont = null;
        int currentFontSize = -1;

        // Vector of the previously retained line, held back until its page status is final.
        FeaturesVectorSegmentation previousFeatures = null;

        boolean endPage = true;   // the block just seen carried a page marker
        boolean newPage = true;   // the next retained line is the first one of a page
        boolean start = true;     // true only for the very first block

        int pageTokenOffset = 0;      // tokens seen so far since the last page marker
        int documentTokenOffset = 0;  // tokens seen so far in the whole document
        int pageLength = 0;           // token total computed by the look-ahead below
        double pageHeight = 0.0d;     // vertical extent computed by the look-ahead below

        // Total number of layout tokens in the document.
        int documentLength = 0;
        for (Block candidate : blocks) {
            List<LayoutToken> candidateTokens = candidate.getTokens();
            if (candidateTokens != null) {
                documentLength += candidateTokens.size();
            }
        }

        for (int blockIndex = 0; blockIndex < blocks.size(); blockIndex++) {
            Block block = blocks.get(blockIndex);

            if (start || endPage) {
                // Look ahead from the current block to measure token count and vertical extent.
                // NOTE(review): the scan is not stopped at the next "@PAGE" marker, so it covers
                // every remaining block. Left unchanged because it determines the emitted feature
                // values - confirm the intent against the original library source.
                pageLength = 0;
                double maxY = 0.0d;
                double minY = 1000000.0d;
                for (int aheadIndex = blockIndex; aheadIndex < blocks.size(); aheadIndex++) {
                    Block ahead = blocks.get(aheadIndex);
                    String aheadText = ahead.getText();
                    if (aheadText == null) {
                        continue;
                    }
                    List<LayoutToken> aheadTokens = ahead.getTokens();
                    if (aheadTokens == null) {
                        continue;
                    }
                    if (aheadText.contains("@PAGE")) {
                        // A marker block only adds to the count once something was counted before it.
                        if (pageLength > 0) {
                            pageLength += aheadTokens.size();
                        }
                        double blockY = ahead.getY();
                        if (blockY != 0.0d && blockY < minY) {
                            minY = blockY;
                        }
                        if (blockY != 0.0d && blockY > maxY) {
                            maxY = blockY;
                        }
                    } else if (!aheadTokens.isEmpty()) {
                        pageLength += aheadTokens.size();
                        double firstY = aheadTokens.get(0).getY();
                        double lastY = aheadTokens.get(aheadTokens.size() - 1).getY();
                        if (firstY != 0.0d && firstY < minY) {
                            minY = firstY;
                        }
                        if (firstY != 0.0d && firstY > maxY) {
                            maxY = firstY;
                        }
                        if (lastY != 0.0d && lastY > maxY) {
                            maxY = lastY;
                        }
                    }
                }
                pageHeight = maxY - minY;
            }

            if (start) {
                newPage = true;
                start = false;
            }
            if (endPage) {
                newPage = true;
                pageTokenOffset = 0;
            }

            String blockText = block.getText();
            if (blockText != null) {
                if (blockText.contains("@PAGE")) {
                    pageTokenOffset = 0;
                    endPage = true;
                    newPage = false;
                } else {
                    endPage = false;
                }
            }

            // A block without text has no lines to feature; its tokens are still counted below.
            String[] lines = blockText == null ? new String[0] : blockText.split("[\\n\\r]");

            int maxLineLength = 0;
            for (String candidateLine : lines) {
                if (candidateLine.length() > maxLineLength) {
                    maxLineLength = candidateLine.length();
                }
            }

            List<LayoutToken> blockTokens = block.getTokens();
            boolean hasTokens = blockTokens != null && !blockTokens.isEmpty();

            for (int lineIndex = 0; hasTokens && lineIndex < lines.length; lineIndex++) {
                String line = lines[lineIndex];

                // Layout information for every line of the block comes from the block's first token.
                LayoutToken firstToken = blockTokens.get(0);
                double lineY = firstToken.getY();

                StringTokenizer words = new StringTokenizer(line, " \t");
                String firstWord = words.hasMoreTokens() ? words.nextToken() : null;
                String secondWord = words.hasMoreTokens() ? words.nextToken() : null;
                if (firstWord == null) {
                    continue;
                }
                String word = firstWord.trim();
                // The line-terminator comparisons are retained from the original as defensive checks.
                if (word.length() == 0
                        || word.equals("\n")
                        || word.equals(LineSeparator.Macintosh)
                        || word.equals("\n\r")
                        || TextUtilities.filterLine(line)) {
                    continue;
                }

                FeaturesVectorSegmentation features = new FeaturesVectorSegmentation();
                features.token = firstToken;
                features.line = line;
                features.string = word;
                features.secondString = secondWord;
                features.firstPageBlock = newPage;
                features.lastPageBlock = endPage;
                features.lineLength = featureFactory.relativeLocation(line.length(), maxLineLength, LINESCALE);
                features.punctuationProfile = TextUtilities.punctuationProfile(line);
                features.lineStatus = null;
                features.punctType = null;

                if (lineIndex == 0
                        || (previousFeatures != null && previousFeatures.blockStatus.equals("BLOCKEND"))) {
                    features.blockStatus = "BLOCKSTART";
                } else if (lineIndex == lines.length - 1) {
                    features.blockStatus = "BLOCKEND";
                } else if (features.blockStatus == null) {
                    features.blockStatus = "BLOCKIN";
                }

                if (newPage) {
                    features.pageStatus = "PAGESTART";
                    // The previously retained line turns out to be the last one of its page.
                    if (previousFeatures != null) {
                        previousFeatures.pageStatus = "PAGEEND";
                    }
                } else {
                    features.pageStatus = "PAGEIN";
                }
                newPage = false;
                endPage = false;

                // Lexical features of the first word. Order matters: later tests override earlier ones.
                if (word.length() == 1) {
                    features.singleChar = true;
                }
                if (Character.isUpperCase(word.charAt(0))) {
                    features.capitalisation = "INITCAP";
                }
                if (featureFactory.test_all_capital(word)) {
                    features.capitalisation = "ALLCAP";
                }
                if (featureFactory.test_digit(word)) {
                    features.digit = "CONTAINSDIGITS";
                }
                if (featureFactory.test_common(word)) {
                    features.commonName = true;
                }
                if (featureFactory.test_names(word)) {
                    features.properName = true;
                }
                if (featureFactory.test_month(word)) {
                    features.month = true;
                }
                if (featureFactory.isDigit.matcher(word).find()) {
                    features.digit = "ALLDIGIT";
                }
                if (featureFactory.YEAR.matcher(word).find()) {
                    features.year = true;
                }
                if (featureFactory.EMAIL.matcher(word).find()) {
                    features.email = true;
                }
                if (featureFactory.HTTP.matcher(word).find()) {
                    features.http = true;
                }

                // Font name relative to the previously retained line.
                String font = firstToken.getFont();
                if (currentFont != null && currentFont.equals(font)) {
                    features.fontStatus = "SAMEFONT";
                } else {
                    currentFont = font;
                    features.fontStatus = "NEWFONT";
                }

                // Font size relative to the previously retained line; the first line counts as higher.
                int fontSize = (int) firstToken.getFontSize();
                if (currentFontSize == -1 || currentFontSize < fontSize) {
                    features.fontSize = "HIGHERFONT";
                } else if (currentFontSize == fontSize) {
                    features.fontSize = "SAMEFONTSIZE";
                } else {
                    features.fontSize = "LOWERFONT";
                }
                currentFontSize = fontSize;

                if (firstToken.getBold()) {
                    features.bold = true;
                }
                if (firstToken.getItalic()) {
                    features.italic = true;
                }
                if (features.capitalisation == null) {
                    features.capitalisation = "NOCAPS";
                }
                if (features.digit == null) {
                    features.digit = "NODIGIT";
                }

                features.relativeDocumentPosition =
                        featureFactory.relativeLocation(documentTokenOffset, documentLength, NBBINS);
                features.relativePagePositionChar =
                        featureFactory.relativeLocation(pageTokenOffset, pageLength, NBBINS);
                int verticalBin = featureFactory.relativeLocation(lineY, pageHeight, NBBINS);
                if (verticalBin > NBBINS) {
                    verticalBin = NBBINS;
                }
                features.relativePagePosition = verticalBin;

                // The previous vector's page status is final now, so it can be printed.
                if (previousFeatures != null) {
                    featureRows.append(previousFeatures.printVector());
                }
                previousFeatures = features;
            }

            if (blockTokens != null) {
                pageTokenOffset += blockTokens.size();
                documentTokenOffset += blockTokens.size();
            }
        }

        if (previousFeatures != null) {
            featureRows.append(previousFeatures.printVector());
        }
        return featureRows.toString();
    }

    /**
     * Generates segmentation training files for one PDF.
     * <p>
     * Three files are written, named after the input file with {@code ".pdf"} replaced:
     * <ul>
     *   <li>{@code <str2>/….training.segmentation} - the feature rows from
     *       {@link #getAllLinesFeatured(Document)} followed by a newline (only the newline when no
     *       rows were produced);</li>
     *   <li>{@code <str2>/….training.segmentation.rawtxt} - the concatenated full-text
     *       tokenizations;</li>
     *   <li>{@code <str3>/….training.segmentation.tei.xml} - the labelled rows wrapped in a TEI
     *       skeleton; written only when at least one feature row exists.</li>
     * </ul>
     * The document source is released in every case, including failure.
     *
     * @param str  path of the input PDF
     * @param str2 directory receiving the feature and raw-text files
     * @param str3 directory receiving the TEI file
     * @param i    identifier written into the {@code xml:id} attribute of the TEI {@code fileDesc}
     * @throws GrobidException wrapping any exception raised while parsing or writing
     */
    public void createTrainingSegmentation(String str, String str2, String str3, int i) {
        DocumentSource documentSource = null;
        try {
            File pdfFile = new File(str);
            documentSource = DocumentSource.fromPdfWithImages(pdfFile, -1, -1);
            Document document = new Document(documentSource);
            String pdfName = pdfFile.getName();
            document.addTokenizedDocument();
            if (document.getBlocks() == null) {
                throw new IllegalStateException("PDF parsing resulted in empty content");
            }

            String featureRows = getAllLinesFeatured(document);
            List<String> tokenizations = document.getTokenizationsFulltext();

            // Feature file. A null result is written as an empty body rather than the text "null".
            File featureFile = new File(str2 + "/" + pdfName.replace(".pdf", ".training.segmentation"));
            try (OutputStreamWriter featureWriter =
                         new OutputStreamWriter(new FileOutputStream(featureFile, false), "UTF-8")) {
                featureWriter.write((featureRows == null ? "" : featureRows) + "\n");
            }

            // Raw text file: the tokenizations concatenated without separators.
            StringBuilder rawText = new StringBuilder();
            for (String token : tokenizations) {
                rawText.append(token);
            }
            File rawTextFile = new File(str2 + "/" + pdfName.replace(".pdf", ".training.segmentation.rawtxt"));
            FileUtils.writeStringToFile(rawTextFile, rawText.toString(), "UTF-8");

            // TEI file, only when there is something to label.
            if (featureRows != null && featureRows.length() > 0) {
                StringBuffer teiBody = trainingExtraction(label(featureRows), tokenizations, document);
                File teiFile = new File(str3 + "/" + pdfName.replace(".pdf", ".training.segmentation.tei.xml"));
                try (OutputStreamWriter teiWriter =
                             new OutputStreamWriter(new FileOutputStream(teiFile, false), "UTF-8")) {
                    teiWriter.write("<?xml version=\"1.0\" ?>\n<tei>\n\t<teiHeader>\n\t\t<fileDesc xml:id=\"" + i + "\"/>\n\t</teiHeader>\n\t<text xml:lang=\"en\">\n");
                    teiWriter.write(teiBody.toString());
                    teiWriter.write("\n\t</text>\n</tei>\n");
                }
            }
        } catch (Exception e) {
            LOGGER.error("Segmentation training data generation failed for " + str, e);
            throw new GrobidException("An exception occured while running Grobid training data generation for segmentation model.", e);
        } finally {
            // documentSource is still null if opening the PDF failed; the original code already
            // passed null to close() on its failure path.
            DocumentSource.close(documentSource, true);
        }
    }

    /**
     * Turns labelled feature rows into the TEI body of a segmentation training file.
     * <p>
     * Each non-blank row of {@code str} is paired, in order, with the next usable line of the
     * document's blocks (same line splitting and {@link TextUtilities#filterLine(String)} filter as
     * the feature extraction). The row's last column is taken as its label when the row has at
     * least three columns; otherwise the label of the previous row is kept. Lines are joined with
     * {@code <lb/>}, and TEI elements are opened/closed whenever the label changes.
     *
     * @param str      labelled rows, one per line, columns separated by spaces or tabs
     * @param list     not used by this method
     * @param document document providing the blocks whose text lines are emitted
     * @return the generated TEI fragment
     * @throws GrobidException wrapping any exception raised during generation
     */
    private StringBuffer trainingExtraction(String str, List<String> list, Document document) {
        StringBuffer tei = new StringBuffer();
        // Model label -> opening TEI tag; tried in this order, the first match writes the line.
        final String[][] fieldTags = {
            {"<header>", "<front>"},
            {"<headnote>", "<note place=\"headnote\">"},
            {"<footnote>", "<note place=\"footnote\">"},
            {"<page>", "<page>"},
            {"<references>", "<listBibl>"},
            {"<body>", "<body>"},
            {"<cover>", "<titlePage>"},
            {"<annex>", "<div type=\"annex\">"},
            {"<acknowledgement>", "<div type=\"acknowledgement\">"}
        };
        final String startPrefix = GenericTaggerUtils.START_ENTITY_LABEL_PREFIX;
        try {
            List<Block> blocks = document.getBlocks();
            int blockIndex = 0;
            int lineIndex = 0;
            StringTokenizer rows = new StringTokenizer(str, "\n");
            String currentLabel = null;   // survives across rows that carry no label column
            String firstColumn = null;
            String previousLabel = null;
            boolean firstRow = true;

            while (rows.hasMoreTokens()) {
                String row = rows.nextToken().trim();
                if (row.length() == 0) {
                    continue;
                }

                // Column 0 is the line's first word; the last column (index >= 2) is the label.
                StringTokenizer columns = new StringTokenizer(row, " \t");
                int columnCount = columns.countTokens();
                for (int position = 0; columns.hasMoreTokens(); position++) {
                    String column = columns.nextToken().trim();
                    if (position == 0) {
                        firstColumn = TextUtilities.HTMLEncode(column);
                    } else if (position > 1 && position == columnCount - 1) {
                        currentLabel = column;
                    }
                }

                // Advance through the blocks to the next non-blank, non-filtered text line.
                String line = null;
                while (line == null && blockIndex < blocks.size()) {
                    Block block = blocks.get(blockIndex);
                    String blockText = block.getTokens() == null ? null : block.getText();
                    String[] blockLines = (blockText == null || blockText.trim().length() == 0)
                            ? new String[0]
                            : blockText.split("[\\n\\r]");
                    if (lineIndex >= blockLines.length) {
                        blockIndex++;
                        lineIndex = 0;
                        continue;
                    }
                    String candidate = blockLines[lineIndex];
                    lineIndex++;
                    if (candidate.trim().length() != 0 && !TextUtilities.filterLine(candidate)) {
                        line = candidate;
                    }
                }

                // NOTE(review): line is still null here when the rows outnumber the document's
                // usable lines; that case is passed on unchanged, as in the original code.
                String encodedLine = TextUtilities.HTMLEncode(line);

                if (!firstRow) {
                    tei.append("<lb/>");
                }

                // Labels with the start-of-entity prefix removed.
                String previousPlain = null;
                if (previousLabel != null) {
                    previousPlain = previousLabel.startsWith(startPrefix)
                            ? previousLabel.substring(startPrefix.length())
                            : previousLabel;
                }
                String currentPlain = null;
                if (currentLabel != null) {
                    currentPlain = currentLabel.startsWith(startPrefix)
                            ? currentLabel.substring(startPrefix.length())
                            : currentLabel;
                }

                if (previousLabel != null) {
                    testClosingTag(tei, currentPlain, previousPlain, currentLabel);
                }

                for (String[] fieldTag : fieldTags) {
                    if (writeField(tei, encodedLine, currentLabel, previousPlain, firstColumn,
                            fieldTag[0], fieldTag[1], false, 3)) {
                        break;
                    }
                }

                previousLabel = currentLabel;
                // After the last row, close whatever element is still open.
                if (!rows.hasMoreTokens() && previousLabel != null) {
                    testClosingTag(tei, "", currentPlain, currentLabel);
                }
                firstRow = false;
            }
            return tei;
        } catch (Exception e) {
            throw new GrobidException("An exception occured while running Grobid.", e);
        }
    }

    /**
     * Appends one text line to the TEI buffer if the line's label denotes the given field.
     * <p>
     * The line is written only when {@code str2} equals {@code str5}, with or without the
     * start-of-entity prefix. The opening tag {@code str6}, preceded by {@code i} tab characters,
     * is emitted first unless the line continues the previous field, i.e. unless {@code str2}
     * equals {@code str3} (with or without the prefix) or {@code str3} is {@code "<titlePage>"}.
     *
     * @param stringBuffer buffer receiving the output
     * @param str          text of the line; {@code "@BULLET"} is replaced by a bullet character
     * @param str2         label of the current line
     * @param str3         label of the previous line, may be {@code null}
     * @param str4         not used by this method
     * @param str5         field label this call is responsible for
     * @param str6         opening tag to emit when a new field starts
     * @param z            not used by this method
     * @param i            number of tab characters written before the opening tag
     * @return {@code true} if the line belonged to the field and was written, {@code false} otherwise
     */
    private boolean writeField(StringBuffer stringBuffer, String str, String str2, String str3, String str4, String str5, String str6, boolean z, int i) {
        final String startPrefix = GenericTaggerUtils.START_ENTITY_LABEL_PREFIX;

        boolean labelIsThisField = str2.equals(str5) || str2.equals(startPrefix + str5);
        if (!labelIsThisField) {
            return false;
        }

        String text = str.replace("@BULLET", "•");

        boolean continuesPrevious = str2.equals(str3)
                || str2.equals(startPrefix + str3)
                || "<titlePage>".equals(str3);
        if (!continuesPrevious) {
            int tabsLeft = i;
            while (tabsLeft > 0) {
                stringBuffer.append("\t");
                tabsLeft--;
            }
            stringBuffer.append(str6);
        }
        stringBuffer.append(text);
        return true;
    }

    /**
     * Emits the closing TEI tag of the previous field when the label has changed.
     * <p>
     * Nothing is written when {@code str} equals {@code str2}, or when {@code str2} is not one of
     * the known field labels.
     *
     * @param stringBuffer buffer receiving the closing tag
     * @param str          label of the current line, without the start-of-entity prefix
     * @param str2         label of the previous line, without the start-of-entity prefix
     * @param str3         not used by this method
     * @return {@code true} only when a {@code <references>} field was closed, {@code false} in
     *         every other case
     */
    private boolean testClosingTag(StringBuffer stringBuffer, String str, String str2, String str3) {
        if (str.equals(str2)) {
            return false;
        }

        final String closingTag;
        switch (str2) {
            case "<header>":
                closingTag = "</front>\n\n";
                break;
            case "<body>":
                closingTag = "</body>\n\n";
                break;
            case "<headnote>":
            case "<footnote>":
                closingTag = "</note>\n\n";
                break;
            case "<references>":
                closingTag = "</listBibl>\n\n";
                break;
            case "<page>":
                closingTag = "</page>\n\n";
                break;
            case "<cover>":
                closingTag = "</titlePage>\n\n";
                break;
            case "<annex>":
            case "<acknowledgement>":
                closingTag = "</div>\n\n";
                break;
            default:
                closingTag = null;
                break;
        }

        if (closingTag != null) {
            stringBuffer.append(closingTag);
        }
        return "<references>".equals(str2);
    }

    /**
     * Closes this parser by delegating to the superclass implementation; this class adds no
     * clean-up of its own.
     *
     * @throws IOException if the superclass implementation throws it
     */
    @Override // org.grobid.core.engines.AbstractParser, java.io.Closeable, java.lang.AutoCloseable
    public void close() throws IOException {
        super.close();
    }
}
