package org.carrot2.text.preprocessing;

import com.carrotsearch.hppc.ByteArrayList;
import com.carrotsearch.hppc.IntArrayList;
import com.carrotsearch.hppc.ShortArrayList;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.lang3.StringUtils;
import org.carrot2.core.Document;
import org.carrot2.core.ProcessingException;
import org.carrot2.core.attribute.Init;
import org.carrot2.shaded.guava.common.collect.Lists;
import org.carrot2.text.analysis.ITokenizer;
import org.carrot2.text.util.MutableCharArray;
import org.carrot2.util.CharArrayUtils;
import org.carrot2.util.ExceptionUtils;
import org.carrot2.util.attribute.Attribute;
import org.carrot2.util.attribute.AttributeLevel;
import org.carrot2.util.attribute.Bindable;
import org.carrot2.util.attribute.DefaultGroups;
import org.carrot2.util.attribute.Group;
import org.carrot2.util.attribute.Input;
import org.carrot2.util.attribute.Label;
import org.carrot2.util.attribute.Level;

/**
 * Splits the configured fields of every input document into tokens and stores the
 * resulting parallel arrays (document index, field index, image, type) in the
 * {@link PreprocessingContext}.
 *
 * <p>The instance keeps scratch buffers in fields while {@link #tokenize} runs, so a
 * single instance must not be used by several threads at once.
 */
@Bindable(prefix = "Tokenizer")
/* loaded from: input_file:org/carrot2/text/preprocessing/Tokenizer.class */
public final class Tokenizer {

    /** Upper bound on the number of fields accepted by {@link #tokenize}. */
    private static final int MAX_FIELDS = 8;

    /** Value returned by the tokenizer's {@code nextToken()} that ends the per-field loop. */
    private static final short END_OF_INPUT = -1;

    // NOTE(review): the three type codes below presumably mirror flag constants declared
    // on ITokenizer; confirm and reference those constants directly if so.

    /** Type code stored for the marker placed between two documents. */
    private static final short TYPE_DOCUMENT_SEPARATOR = 512;

    /** Type code stored for the marker placed between two fields of one document. */
    private static final short TYPE_FIELD_SEPARATOR = 1024;

    /** Type code stored for the marker that ends the whole token sequence. */
    private static final short TYPE_TERMINATOR = 2048;

    /** Index value stored when a marker does not belong to a document or a field. */
    private static final byte NO_INDEX = -1;

    /**
     * Names of the document fields to tokenize, in the order in which they are processed.
     * At most eight fields are supported.
     */
    @Level(AttributeLevel.ADVANCED)
    @Init
    @Group(DefaultGroups.PREPROCESSING)
    @Input
    @Attribute
    @Label("Document fields")
    public Collection<String> documentFields = Arrays.asList(Document.TITLE, Document.SUMMARY);

    /** Scratch buffer: token images; only non-null while {@link #tokenize} is running. */
    private ArrayList<char[]> images;

    /** Scratch buffer: token type codes, parallel to {@link #images}. */
    private ShortArrayList tokenTypes;

    /** Scratch buffer: index of the owning document, parallel to {@link #images}. */
    private IntArrayList documentIndices;

    /** Scratch buffer: index of the owning field, parallel to {@link #images}. */
    private ByteArrayList fieldIndices;

    /**
     * Tokenizes all documents of the given context and publishes the results in
     * {@code allTokens} and {@code allFields} of that context.
     *
     * <p>Non-empty fields of one document are separated by a field separator, consecutive
     * documents by a document separator, and the whole sequence ends with a terminator.
     *
     * @param preprocessingContext context providing the documents and the language-specific
     *     tokenizer, and receiving the results
     * @throws ProcessingException if more than eight fields are configured
     */
    public void tokenize(PreprocessingContext preprocessingContext) {
        final List<Document> documents = preprocessingContext.documents;
        final String[] fieldNames =
                this.documentFields.toArray(new String[this.documentFields.size()]);
        if (fieldNames.length > MAX_FIELDS) {
            throw new ProcessingException("Maximum number of tokenized fields is 8.");
        }

        this.images = new ArrayList<char[]>();
        this.tokenTypes = new ShortArrayList();
        this.documentIndices = new IntArrayList();
        this.fieldIndices = new ByteArrayList();
        try {
            final Iterator<Document> documentIterator = documents.iterator();
            final ITokenizer tokenizer = preprocessingContext.language.getTokenizer();
            final MutableCharArray termBuffer = new MutableCharArray(CharArrayUtils.EMPTY_ARRAY);

            int documentIndex = 0;
            while (documentIterator.hasNext()) {
                tokenizeDocument(preprocessingContext, tokenizer, termBuffer,
                        documentIterator.next(), documentIndex, fieldNames);
                // Separators go between documents only, never after the last one.
                if (documentIterator.hasNext()) {
                    addDocumentSeparator();
                }
                documentIndex++;
            }
            addTerminator();

            preprocessingContext.allTokens.documentIndex = this.documentIndices.toArray();
            preprocessingContext.allTokens.fieldIndex = this.fieldIndices.toArray();
            // A typed char[][] seed is required; a one-dimensional char[] is not an Object[].
            preprocessingContext.allTokens.image =
                    this.images.toArray(new char[this.images.size()][]);
            preprocessingContext.allTokens.type = this.tokenTypes.toArray();
            preprocessingContext.allFields.name = fieldNames;
        } finally {
            // Release the scratch buffers even when tokenization fails half-way.
            this.images = null;
            this.fieldIndices = null;
            this.tokenTypes = null;
            this.documentIndices = null;
        }
    }

    /**
     * Appends the tokens of all configured, non-empty fields of one document, inserting a
     * field separator before every field that follows an earlier field with tokens.
     */
    private void tokenizeDocument(PreprocessingContext preprocessingContext, ITokenizer tokenizer,
            MutableCharArray termBuffer, Document document, int documentIndex,
            String[] fieldNames) {
        boolean hadTokens = false;
        for (int fieldIndex = 0; fieldIndex < fieldNames.length; fieldIndex++) {
            final String fieldValue = (String) document.getField(fieldNames[fieldIndex]);
            if (StringUtils.isEmpty(fieldValue)) {
                continue;
            }
            try {
                tokenizer.reset(new StringReader(fieldValue));
                short tokenType = tokenizer.nextToken();
                if (tokenType == END_OF_INPUT) {
                    // The field produced no tokens at all; it must not trigger a separator.
                    continue;
                }
                if (hadTokens) {
                    addFieldSeparator(documentIndex);
                }
                do {
                    tokenizer.setTermBuffer(termBuffer);
                    add(documentIndex, (byte) fieldIndex,
                            preprocessingContext.intern(termBuffer), tokenType);
                    tokenType = tokenizer.nextToken();
                } while (tokenType != END_OF_INPUT);
                hadTokens = true;
            } catch (IOException e) {
                throw ExceptionUtils.wrapAsRuntimeException(e);
            }
        }
    }

    /** Appends the marker that ends the token sequence (no document, no field, no image). */
    void addTerminator() {
        add(-1, NO_INDEX, null, TYPE_TERMINATOR);
    }

    /** Appends the marker that separates two documents (no document, no field, no image). */
    void addDocumentSeparator() {
        add(-1, NO_INDEX, null, TYPE_DOCUMENT_SEPARATOR);
    }

    /**
     * Appends the marker that separates two fields of the same document.
     *
     * @param i index of the document the separator belongs to
     */
    void addFieldSeparator(int i) {
        add(i, NO_INDEX, null, TYPE_FIELD_SEPARATOR);
    }

    /**
     * Appends a separator entry that carries both a document and a field index.
     *
     * <p>NOTE(review): this stores the same type code (1024) as {@link #addFieldSeparator};
     * confirm whether a distinct sentence-separator code was intended.
     *
     * @param i index of the document the separator belongs to
     * @param b index of the field the separator belongs to
     */
    void addSentenceSeparator(int i, byte b) {
        add(i, b, null, TYPE_FIELD_SEPARATOR);
    }

    /**
     * Appends one entry to all four parallel scratch buffers. Must only be called while
     * {@link #tokenize} is running, because the buffers are {@code null} otherwise.
     *
     * @param i document index of the entry, or {@code -1} if none
     * @param b field index of the entry, or {@code -1} if none
     * @param cArr token image, or {@code null} for separators and the terminator
     * @param s token type code
     */
    void add(int i, byte b, char[] cArr, short s) {
        this.documentIndices.add(i);
        this.fieldIndices.add(b);
        this.images.add(cArr);
        this.tokenTypes.add(s);
    }
}
