package weka.core;

import at.tugraz.ist.spreadsheet.abstraction.location.Coordinates;
import at.tugraz.ist.spreadsheet.analysis.corpus.output.MetricListWriter;
import at.tugraz.ist.spreadsheet.analysis.faultextraction.FaultExtractionEngine;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.io.Serializable;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.Vector;
import org.apache.commons.lang3.StringUtils;
import weka.classifiers.lazy.kstar.KStarConstants;
import weka.core.stemmers.NullStemmer;
import weka.core.stemmers.Stemmer;
import weka.core.stopwords.Null;
import weka.core.stopwords.StopwordsHandler;
import weka.core.tokenizers.Tokenizer;
import weka.core.tokenizers.WordTokenizer;
import weka.gui.ProgrammaticProperty;

/* loaded from: input_file:weka/core/DictionaryBuilder.class */
public class DictionaryBuilder implements Aggregateable<DictionaryBuilder>, OptionHandler, Serializable {
    /** Serialization version identifier. */
    private static final long serialVersionUID = 5579506627960356012L;
    /** Structure of the incoming data; setup() stores a string-free copy of its argument here. */
    protected Instances m_inputFormat;
    /** Header that vectorized instances are attached to (not assigned in the visible part of this file, only cleared by reset()). */
    protected Instances m_outputFormat;
    /** One word-to-counts dictionary per class; processInstance() keeps {term count, document count} per word. */
    protected Map<String, int[]>[] m_dictsPerClass;
    /** Final dictionary: element 0 of each value is used as the word's index offset, element 1 (if present) as its document count. */
    protected Map<String, int[]> m_consolidatedDict;
    /** Scratch map holding the word counts of the instance currently being processed. */
    protected transient Map<String, int[]> m_inputVector;
    /** When true, a single dictionary is used even if a nominal class attribute is set. */
    protected boolean m_doNotOperateOnPerClassBasis;
    /** When true, vectorized values are word counts instead of presence indicators. */
    protected boolean m_outputCounts;
    /** When true, tokens are lower-cased before stemming and dictionary lookup. */
    protected boolean m_lowerCaseTokens;
    /** Number of processed instances between dictionary prunes; values <= 0 disable periodic pruning. */
    protected long m_periodicPruneRate;
    /** When true, word values are transformed to log(1 + value). */
    protected boolean m_TFTransform;
    /** When true, word values are multiplied by an inverse-document-frequency factor. */
    protected boolean m_IDFTransform;
    /** When true, vectorized instances are scaled to the average document length. */
    protected boolean m_normalize;
    /** Running sum of document lengths, accumulated by processInstance() while normalization is on. */
    protected double m_docLengthSum;
    /** Average document length used by normalizeInstance(). */
    protected double m_avgDocLength;
    /** When true, dictionaries are TreeMaps (sorted by word); otherwise LinkedHashMaps. */
    protected boolean m_sortDictionary;
    /** Whether the data passed to setup() contained string attributes. */
    protected boolean m_inputContainsStringAttributes;
    /** Stemmer applied to every token. */
    protected Stemmer m_stemmer = new NullStemmer();
    /** Handler deciding which stemmed tokens are dropped as stopwords. */
    protected StopwordsHandler m_stopwordsHandler = new Null();
    /** Number of words to attempt to keep (the code that applies this limit is not in the visible part of the file). */
    protected int m_wordsToKeep = 1000;
    /** Minimum term count a word needs to survive pruneDictionary(). */
    protected int m_minFrequency = 1;
    /** Number of instances processed so far. */
    protected int m_count = 0;
    /** Tokenizer used to split string values into words. */
    protected Tokenizer m_tokenizer = new WordTokenizer();
    /** Range of attributes to convert to words. */
    protected Range m_selectedRange = new Range("first-last");
    /** Class index of the input format, captured by setup(). */
    protected int m_classIndex = -1;
    /** Number of per-class dictionaries in use. */
    protected int m_numClasses = 1;
    /** Prefix prepended to the names of the generated word attributes. */
    protected String m_Prefix = "";

    /**
     * Sets the average document length used when normalizing vectorized instances.
     *
     * @param d the average document length to store
     */
    @ProgrammaticProperty
    public void setAverageDocLength(double d) {
        m_avgDocLength = d;
    }

    /**
     * Returns the average document length currently held by this builder.
     *
     * @return the stored average document length
     */
    public double getAverageDocLength() {
        return m_avgDocLength;
    }

    /**
     * Returns the tip text describing the sortDictionary property.
     *
     * @return the tip text
     */
    public String sortDictionaryTipText() {
        final String tip = "Sort the dictionary alphabetically";
        return tip;
    }

    /**
     * Sets whether the dictionaries should be kept sorted by word.
     *
     * @param z true to use sorted dictionaries
     */
    public void setSortDictionary(boolean z) {
        m_sortDictionary = z;
    }

    /**
     * Returns whether the dictionaries are kept sorted by word.
     *
     * @return true if sorted dictionaries are used
     */
    public boolean getSortDictionary() {
        return m_sortDictionary;
    }

    /**
     * Returns whether word counts are output instead of presence indicators.
     *
     * @return true if word counts are output
     */
    public boolean getOutputWordCounts() {
        return m_outputCounts;
    }

    /**
     * Sets whether word counts are output instead of presence indicators.
     *
     * @param z true to output word counts
     */
    public void setOutputWordCounts(boolean z) {
        m_outputCounts = z;
    }

    /**
     * Returns the tip text describing the outputWordCounts property.
     *
     * @return the tip text
     */
    public String outputWordCountsTipText() {
        return "Output word counts rather than boolean 0 or 1"
            + "(indicating presence or absence of a word).";
    }

    /**
     * Returns the range of attributes this builder operates on.
     *
     * @return the selected range object (the live instance, not a copy)
     */
    public Range getSelectedRange() {
        return m_selectedRange;
    }

    /**
     * Replaces the selected range with a new one built from the given specification.
     *
     * @param str the range specification string
     */
    public void setSelectedRange(String str) {
        Range replacement = new Range(str);
        m_selectedRange = replacement;
    }

    /**
     * Returns the tip text describing the attributeIndices property.
     *
     * @return the tip text
     */
    public String attributeIndicesTipText() {
        return "Specify range of attributes to act on. "
            + "This is a comma separated list of attribute indices, with \"first\" and \"last\" valid values. "
            + "Specify an inclusive range with \"-\". "
            + "E.g: \"first-3,5,6-10,last\".";
    }

    /**
     * Returns the selected attribute range as a range-list string.
     *
     * @return the range specification held by the selected range
     */
    public String getAttributeIndices() {
        return m_selectedRange.getRanges();
    }

    /**
     * Updates the existing selected range with a new range-list string.
     *
     * @param str the range specification to apply
     */
    public void setAttributeIndices(String str) {
        m_selectedRange.setRanges(str);
    }

    /**
     * Sets the selected attributes from an array of indices by converting the
     * array to a range-list string first.
     *
     * @param iArr the attribute indices to select
     */
    public void setAttributeIndicesArray(int[] iArr) {
        String rangeList = Range.indicesToRangeList(iArr);
        setAttributeIndices(rangeList);
    }

    /**
     * Returns the tip text describing the invertSelection property.
     *
     * @return the tip text
     */
    public String invertSelectionTipText() {
        return "Set attribute selection mode. "
            + "If false, only selected attributes in the range will be worked on; "
            + "if true, only non-selected attributes will be processed.";
    }

    /**
     * Returns whether the selected range is inverted.
     *
     * @return the invert flag of the selected range
     */
    public boolean getInvertSelection() {
        return m_selectedRange.getInvert();
    }

    /**
     * Sets whether the selected range is inverted.
     *
     * @param z the invert flag to apply to the selected range
     */
    public void setInvertSelection(boolean z) {
        m_selectedRange.setInvert(z);
    }

    /**
     * Returns the number of words to attempt to keep.
     *
     * @return the configured number of words
     */
    public int getWordsToKeep() {
        return m_wordsToKeep;
    }

    /**
     * Sets the number of words to attempt to keep.
     *
     * @param i the number of words
     */
    public void setWordsToKeep(int i) {
        m_wordsToKeep = i;
    }

    /**
     * Returns the tip text describing the wordsToKeep property.
     *
     * @return the tip text
     */
    public String wordsToKeepTipText() {
        return "The number of words "
            + "(per class if there is a class attribute assigned) "
            + "to attempt to keep.";
    }

    /**
     * Returns the periodic pruning rate.
     *
     * @return the number of processed instances between prunes; values of zero
     *         or less mean that pruneDictionary() does nothing
     */
    public long getPeriodicPruning() {
        return m_periodicPruneRate;
    }

    /**
     * Sets the periodic pruning rate.
     *
     * @param j the number of processed instances between prunes; zero or a
     *          negative value disables periodic pruning
     */
    public void setPeriodicPruning(long j) {
        m_periodicPruneRate = j;
    }

    /**
     * Returns the tip text describing the periodicPruning property.
     *
     * <p>The rate is a number of instances, not a percentage: pruneDictionary()
     * prunes whenever the processed-instance count is a multiple of the rate,
     * which is also how the -prune-rate command-line help describes it.
     *
     * @return the tip text
     */
    public String periodicPruningTipText() {
        return "Specify the rate (every x instances) at which to periodically prune the dictionary. "
            + "wordsToKeep prunes after creating a full dictionary. "
            + "You may not have enough memory for this approach.";
    }

    /**
     * Returns whether the log(1 + value) term-frequency transform is applied.
     *
     * @return true if the TF transform is enabled
     */
    public boolean getTFTransform() {
        return m_TFTransform;
    }

    /**
     * Sets whether the log(1 + value) term-frequency transform is applied.
     *
     * @param z true to enable the TF transform
     */
    public void setTFTransform(boolean z) {
        m_TFTransform = z;
    }

    /**
     * Returns the tip text describing the TFTransform property.
     *
     * @return the tip text
     */
    public String TFTransformTipText() {
        return "Sets whether if the word frequencies should be transformed into:\n"
            + "    log(1+fij) \n"
            + "       where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns the prefix prepended to the names of the generated word attributes.
     *
     * @return the attribute name prefix
     */
    public String getAttributeNamePrefix() {
        return m_Prefix;
    }

    /**
     * Sets the prefix prepended to the names of the generated word attributes.
     *
     * @param str the attribute name prefix
     */
    public void setAttributeNamePrefix(String str) {
        m_Prefix = str;
    }

    /**
     * Returns the tip text describing the attributeNamePrefix property.
     *
     * @return the tip text
     */
    public String attributeNamePrefixTipText() {
        final String tip = "Prefix for the created attribute names. (default: \"\")";
        return tip;
    }

    /**
     * Returns whether the inverse-document-frequency transform is applied.
     *
     * @return true if the IDF transform is enabled
     */
    public boolean getIDFTransform() {
        return m_IDFTransform;
    }

    /**
     * Sets whether the inverse-document-frequency transform is applied.
     *
     * @param z true to enable the IDF transform
     */
    public void setIDFTransform(boolean z) {
        m_IDFTransform = z;
    }

    /**
     * Returns the tip text describing the IDFTransform property.
     *
     * @return the tip text
     */
    public String IDFTransformTipText() {
        return "Sets whether if the word frequencies in a document should be transformed into: \n"
            + "   fij*log(num of Docs/num of Docs with word i) \n"
            + "      where fij is the frequency of word i in document (instance) j.";
    }

    /**
     * Returns whether vectorized instances are normalized.
     *
     * @return true if normalization is enabled
     */
    public boolean getNormalize() {
        return m_normalize;
    }

    /**
     * Sets whether vectorized instances are normalized.
     *
     * @param z true to enable normalization
     */
    public void setNormalize(boolean z) {
        m_normalize = z;
    }

    /**
     * Returns the tip text describing the normalize property.
     *
     * @return the tip text
     */
    public String normalizeTipText() {
        return "Whether word frequencies for a document (instance) "
            + "should be normalized or not";
    }

    /**
     * Returns the tip text describing document-length normalization.
     *
     * @return the tip text
     */
    public String normalizeDocLengthTipText() {
        return "Sets whether if the word frequencies for a document (instance) "
            + "should be normalized or not.";
    }

    /**
     * Returns whether tokens are converted to lower case.
     *
     * @return true if tokens are lower-cased
     */
    public boolean getLowerCaseTokens() {
        return m_lowerCaseTokens;
    }

    /**
     * Sets whether tokens are converted to lower case.
     *
     * @param z true to lower-case tokens
     */
    public void setLowerCaseTokens(boolean z) {
        m_lowerCaseTokens = z;
    }

    /**
     * Returns the tip text describing the lowerCaseTokens property.
     *
     * @return the tip text
     */
    public String lowerCaseTokensTipText() {
        return "If set then all the word tokens are converted to lower case "
            + "before being added to the dictionary.";
    }

    /**
     * Returns the tip text describing the doNotOperateOnPerClassBasis property.
     *
     * @return the tip text
     */
    public String doNotOperateOnPerClassBasisTipText() {
        return "If this is set, the maximum number of words and the minimum term frequency "
            + "is not enforced on a per-class basis but based on the documents in all the classes "
            + "(even if a class attribute is set).";
    }

    /**
     * Returns whether per-class dictionaries are disabled.
     *
     * @return true if a single dictionary is used regardless of the class attribute
     */
    public boolean getDoNotOperateOnPerClassBasis() {
        return m_doNotOperateOnPerClassBasis;
    }

    /**
     * Sets whether per-class dictionaries are disabled.
     *
     * @param z true to use a single dictionary regardless of the class attribute
     */
    public void setDoNotOperateOnPerClassBasis(boolean z) {
        m_doNotOperateOnPerClassBasis = z;
    }

    /**
     * Returns the tip text describing the minTermFreq property.
     *
     * @return the tip text
     */
    public String minTermFreqTipText() {
        return "Sets the minimum term frequency. "
            + "This is enforced on a per-class basis.";
    }

    /**
     * Returns the minimum term frequency.
     *
     * @return the minimum term count a word needs to survive pruning
     */
    public int getMinTermFreq() {
        return m_minFrequency;
    }

    /**
     * Sets the minimum term frequency.
     *
     * @param i the minimum term count a word needs to survive pruning
     */
    public void setMinTermFreq(int i) {
        m_minFrequency = i;
    }

    /**
     * Returns the stemmer applied to tokens.
     *
     * @return the current stemmer
     */
    public Stemmer getStemmer() {
        return m_stemmer;
    }

    /**
     * Sets the stemmer applied to tokens.
     *
     * @param stemmer the stemmer to use; {@code null} selects a {@link NullStemmer}
     */
    public void setStemmer(Stemmer stemmer) {
        m_stemmer = (stemmer == null) ? new NullStemmer() : stemmer;
    }

    /**
     * Returns the tip text describing the stemmer property.
     *
     * @return the tip text
     */
    public String stemmerTipText() {
        final String tip = "The stemming algorithm to use on the words.";
        return tip;
    }

    /**
     * Returns the stopwords handler in use.
     *
     * @return the current stopwords handler
     */
    public StopwordsHandler getStopwordsHandler() {
        return m_stopwordsHandler;
    }

    /**
     * Sets the stopwords handler.
     *
     * @param stopwordsHandler the handler to use; {@code null} selects the
     *                         {@link Null} handler
     */
    public void setStopwordsHandler(StopwordsHandler stopwordsHandler) {
        m_stopwordsHandler = (stopwordsHandler == null) ? new Null() : stopwordsHandler;
    }

    /**
     * Returns the tip text describing the stopwordsHandler property.
     *
     * @return the tip text
     */
    public String stopwordsHandlerTipText() {
        return "The stopwords handler to use "
            + "(Null means no stopwords are used).";
    }

    /**
     * Returns the tokenizer used to split string values into words.
     *
     * @return the current tokenizer
     */
    public Tokenizer getTokenizer() {
        return m_tokenizer;
    }

    /**
     * Sets the tokenizer used to split string values into words.
     *
     * <p>A {@code null} argument falls back to a {@link WordTokenizer}, in line
     * with how setStemmer() and setStopwordsHandler() treat {@code null}. The
     * tokenizer is dereferenced unconditionally when instances are processed or
     * vectorized, so storing {@code null} would only defer a NullPointerException.
     *
     * @param tokenizer the tokenizer to use, or {@code null} for the default
     */
    public void setTokenizer(Tokenizer tokenizer) {
        if (tokenizer != null) {
            this.m_tokenizer = tokenizer;
        } else {
            this.m_tokenizer = new WordTokenizer();
        }
    }

    /**
     * Returns the tip text describing the tokenizer property.
     *
     * @return the tip text
     */
    public String tokenizerTipText() {
        final String tip = "The tokenizing algorithm to use on the strings.";
        return tip;
    }

    /**
     * Lists the command-line options understood by setOptions().
     *
     * <p>Option names are written as plain literals that match the switches
     * parsed by setOptions() and emitted by getOptions(). In particular
     * {@code -N} is declared as a flag (no argument), and the stopwords handler
     * option is registered under the name {@code stopwords-handler}, without a
     * leading dash.
     *
     * @return an enumeration of the available options
     */
    @Override
    public Enumeration<Option> listOptions() {
        Vector<Option> options = new Vector<Option>();
        options.addElement(new Option("\tOutput word counts rather than boolean word presence.\n", "C", 0, "-C"));
        options.addElement(new Option("\tSpecify list of string attributes to convert to words (as weka Range).\n\t(default: select all string attributes)", "R", 1, "-R <index1,index2-index4,...>"));
        options.addElement(new Option("\tInvert matching sense of column indexes.", "V", 0, "-V"));
        options.addElement(new Option("\tSpecify a prefix for the created attribute names.\n\t(default: \"\")", "P", 1, "-P <attribute name prefix>"));
        options.addElement(new Option("\tSpecify approximate number of word fields to create.\n\tSurplus words will be discarded..\n\t(default: 1000)", "W", 1, "-W <number of words to keep>"));
        options.addElement(new Option("\tSpecify the rate (e.g., every x instances) at which to periodically prune the dictionary.\n\t-W prunes after creating a full dictionary. You may not have enough memory for this approach.\n\t(default: no periodic pruning)", "prune-rate", 1, "-prune-rate <every x instances>"));
        options.addElement(new Option("\tTransform the word frequencies into log(1+fij)\n\twhere fij is the frequency of word i in jth document(instance).\n", "T", 0, "-T"));
        options.addElement(new Option("\tTransform each word frequency into:\n\tfij*log(num of Documents/num of documents containing word i)\n\t  where fij if frequency of word i in jth document(instance)", "I", 0, "-I"));
        // -N is read with Utils.getFlag and written without a value, so it takes no argument.
        options.addElement(new Option("\tNormalize word frequencies of a document (instance) to the\n\taverage document length (default: don't normalize).", "N", 0, "-N"));
        options.addElement(new Option("\tConvert all tokens to lowercase before adding to the dictionary.", "L", 0, "-L"));
        options.addElement(new Option("\tThe stopwords handler to use (default Null).", "stopwords-handler", 1, "-stopwords-handler"));
        options.addElement(new Option("\tThe stemming algorithm (classname plus parameters) to use.", "stemmer", 1, "-stemmer <spec>"));
        options.addElement(new Option("\tThe minimum term frequency (default = 1).", "M", 1, "-M <int>"));
        options.addElement(new Option("\tIf this is set, the maximum number of words and the \n\tminimum term frequency is not enforced on a per-class \n\tbasis but based on the documents in all the classes \n\t(even if a class attribute is set).", "O", 0, "-O"));
        options.addElement(new Option("\tThe tokenizing algorihtm (classname plus parameters) to use.\n\t(default: " + WordTokenizer.class.getName() + ")", "tokenizer", 1, "-tokenizer <spec>"));
        return options.elements();
    }

    /**
     * Returns the current configuration as a command-line option array.
     *
     * <p>The range, words-to-keep, prune-rate and minimum-frequency options are
     * always emitted. Flags are emitted only when set. The stemmer, stopwords
     * handler and tokenizer are emitted as "classname [options]" specifications
     * when they are non-null.
     *
     * @return the option array
     */
    @Override
    public String[] getOptions() {
        List<String> options = new ArrayList<String>();

        options.add("-R");
        options.add(getSelectedRange().getRanges());
        if (getInvertSelection()) {
            options.add("-V");
        }
        if (!"".equals(getAttributeNamePrefix())) {
            options.add("-P");
            options.add(getAttributeNamePrefix());
        }
        options.add("-W");
        options.add(String.valueOf(getWordsToKeep()));
        options.add("-prune-rate");
        options.add(String.valueOf(getPeriodicPruning()));
        if (getOutputWordCounts()) {
            options.add("-C");
        }
        if (getTFTransform()) {
            options.add("-T");
        }
        if (getIDFTransform()) {
            options.add("-I");
        }
        if (getNormalize()) {
            options.add("-N");
        }
        if (getLowerCaseTokens()) {
            options.add("-L");
        }

        if (getStemmer() != null) {
            options.add("-stemmer");
            String stemmerSpec = getStemmer().getClass().getName();
            if (getStemmer() instanceof OptionHandler) {
                stemmerSpec = stemmerSpec + " " + Utils.joinOptions(((OptionHandler) getStemmer()).getOptions());
            }
            options.add(stemmerSpec.trim());
        }

        if (getStopwordsHandler() != null) {
            options.add("-stopwords-handler");
            String handlerSpec = getStopwordsHandler().getClass().getName();
            if (getStopwordsHandler() instanceof OptionHandler) {
                handlerSpec = handlerSpec + " " + Utils.joinOptions(((OptionHandler) getStopwordsHandler()).getOptions());
            }
            options.add(handlerSpec.trim());
        }

        options.add("-M");
        options.add(String.valueOf(getMinTermFreq()));
        if (getDoNotOperateOnPerClassBasis()) {
            options.add("-O");
        }

        // Guard against a null tokenizer, as is done for the stemmer and the
        // stopwords handler; setOptions() falls back to the default tokenizer
        // when the option is absent.
        if (getTokenizer() != null) {
            options.add("-tokenizer");
            String tokenizerSpec = getTokenizer().getClass().getName() + " " + Utils.joinOptions(getTokenizer().getOptions());
            options.add(tokenizerSpec.trim());
        }

        return options.toArray(new String[options.size()]);
    }

    /**
     * Configures this builder from a command-line option array.
     *
     * <p>Options that are absent are reset to their defaults: range
     * "first-last", empty prefix, 1000 words to keep, prune rate -1 (no periodic
     * pruning), minimum term frequency 1, null stemmer / stopwords handler
     * (replaced by their setters' defaults) and a {@link WordTokenizer}.
     *
     * @param strArr the options to parse
     * @throws Exception if a numeric option cannot be parsed, a stemmer,
     *                   stopwords handler or tokenizer specification is invalid
     *                   or cannot be instantiated, or unrecognised options remain
     */
    @Override
    public void setOptions(String[] strArr) throws Exception {
        String rangeSpec = Utils.getOption('R', strArr);
        setSelectedRange(rangeSpec.length() != 0 ? rangeSpec : "first-last");

        setInvertSelection(Utils.getFlag('V', strArr));

        String prefix = Utils.getOption('P', strArr);
        setAttributeNamePrefix(prefix.length() != 0 ? prefix : "");

        String wordsToKeep = Utils.getOption('W', strArr);
        setWordsToKeep(wordsToKeep.length() != 0 ? Integer.parseInt(wordsToKeep) : 1000);

        // The prune rate is stored as a long, so parse it as one.
        String pruneRate = Utils.getOption("prune-rate", strArr);
        if (pruneRate.length() > 0) {
            setPeriodicPruning(Long.parseLong(pruneRate));
        } else {
            setPeriodicPruning(-1L);
        }

        String minFrequency = Utils.getOption('M', strArr);
        setMinTermFreq(minFrequency.length() != 0 ? Integer.parseInt(minFrequency) : 1);

        setOutputWordCounts(Utils.getFlag('C', strArr));
        setTFTransform(Utils.getFlag('T', strArr));
        setIDFTransform(Utils.getFlag('I', strArr));
        setDoNotOperateOnPerClassBasis(Utils.getFlag('O', strArr));
        setNormalize(Utils.getFlag('N', strArr));
        setLowerCaseTokens(Utils.getFlag('L', strArr));

        String stemmerSpec = Utils.getOption("stemmer", strArr);
        if (stemmerSpec.length() == 0) {
            setStemmer(null);
        } else {
            String[] stemmerOptions = Utils.splitOptions(stemmerSpec);
            if (stemmerOptions.length == 0) {
                throw new Exception("Invalid stemmer specification string");
            }
            // First element is the class name; blank it so only its options remain.
            String stemmerClass = stemmerOptions[0];
            stemmerOptions[0] = "";
            setStemmer((Stemmer) Utils.forName(Stemmer.class, stemmerClass, stemmerOptions));
        }

        String handlerSpec = Utils.getOption("stopwords-handler", strArr);
        if (handlerSpec.length() == 0) {
            setStopwordsHandler(null);
        } else {
            String[] handlerOptions = Utils.splitOptions(handlerSpec);
            if (handlerOptions.length == 0) {
                throw new Exception("Invalid StopwordsHandler specification string");
            }
            String handlerClass = handlerOptions[0];
            handlerOptions[0] = "";
            setStopwordsHandler((StopwordsHandler) Utils.forName(StopwordsHandler.class, handlerClass, handlerOptions));
        }

        String tokenizerSpec = Utils.getOption("tokenizer", strArr);
        if (tokenizerSpec.length() == 0) {
            setTokenizer(new WordTokenizer());
        } else {
            String[] tokenizerOptions = Utils.splitOptions(tokenizerSpec);
            if (tokenizerOptions.length == 0) {
                throw new Exception("Invalid tokenizer specification string");
            }
            String tokenizerClass = tokenizerOptions[0];
            tokenizerOptions[0] = "";
            setTokenizer((Tokenizer) Utils.forName(Tokenizer.class, tokenizerClass, tokenizerOptions));
        }

        Utils.checkForRemainingOptions(strArr);
    }

    /**
     * Prepares this builder for the given data.
     *
     * <p>Records whether the data contains string attributes and stores a
     * string-free copy of its structure. When string attributes are present,
     * the per-class dictionaries are allocated (one per class value when a
     * nominal class is set and per-class operation is enabled, otherwise one)
     * and the selected attribute range is resolved against the data.
     *
     * @param instances the data whose structure is to be used
     * @throws Exception declared by the signature; the visible code does not throw one explicitly
     */
    public void setup(Instances instances) throws Exception {
        m_inputContainsStringAttributes = instances.checkForStringAttributes();
        m_inputFormat = instances.stringFreeStructure();
        if (!m_inputContainsStringAttributes) {
            return;
        }

        boolean perClass = !m_doNotOperateOnPerClassBasis
            && m_inputFormat.classIndex() >= 0
            && m_inputFormat.classAttribute().isNominal();
        if (perClass) {
            m_numClasses = m_inputFormat.numClasses();
        } else {
            m_numClasses = 1;
        }

        if (m_sortDictionary) {
            m_dictsPerClass = new TreeMap[m_numClasses];
        } else {
            m_dictsPerClass = new LinkedHashMap[m_numClasses];
        }
        m_classIndex = m_inputFormat.classIndex();

        for (int classSlot = 0; classSlot < m_numClasses; classSlot++) {
            if (m_sortDictionary) {
                m_dictsPerClass[classSlot] = new TreeMap<String, int[]>();
            } else {
                m_dictsPerClass[classSlot] = new LinkedHashMap<String, int[]>();
            }
        }
        determineSelectedRange(instances);
    }

    /**
     * Returns the input structure captured by setup().
     *
     * @return the stored input format, or {@code null} if none is set
     */
    public Instances getInputFormat() {
        return m_inputFormat;
    }

    /**
     * Reports whether both an input format and a consolidated dictionary are available.
     *
     * @return true if neither the input format nor the consolidated dictionary is {@code null}
     */
    public boolean readyToVectorize() {
        return m_inputFormat != null && m_consolidatedDict != null;
    }

    /**
     * Narrows the selected range to the string attributes of the given data.
     *
     * <p>If no range has been set, one covering every string attribute is
     * created. The range is then bounded by the number of attributes and
     * rewritten so that it contains only the attributes that are both inside the
     * range and of string type. Indices in the range list are 1-based and comma
     * separated, with a trailing comma left after the last index.
     *
     * @param instances the data whose attributes are inspected
     */
    private void determineSelectedRange(Instances instances) {
        final int numAttributes = instances.numAttributes();

        if (m_selectedRange == null) {
            StringBuilder allStrings = new StringBuilder();
            for (int att = 0; att < numAttributes; att++) {
                if (instances.attribute(att).isString()) {
                    allStrings.append(att + 1).append(",");
                }
            }
            m_selectedRange = new Range(allStrings.toString());
        }
        m_selectedRange.setUpper(numAttributes - 1);

        StringBuilder selectedStrings = new StringBuilder();
        for (int att = 0; att < numAttributes; att++) {
            if (m_selectedRange.isInRange(att) && instances.attribute(att).isString()) {
                selectedStrings.append(att + 1).append(",");
            }
        }
        m_selectedRange.setRanges(selectedStrings.toString());
        // Setting new ranges requires the upper limit to be applied again.
        m_selectedRange.setUpper(numAttributes - 1);
    }

    /**
     * Returns the structure of vectorized data.
     *
     * <p>Without string attributes the input format itself is returned. If an
     * output format has already been stored it is returned as is. Otherwise a
     * new header is assembled from copies of the attributes outside the selected
     * range, followed by one attribute per dictionary word (named prefix + word).
     * The class index is carried over to its new position. The new header is
     * returned without being stored in this method.
     *
     * @return the vectorized structure
     * @throws Exception if no input format is available or the dictionary has
     *                   not been consolidated yet
     */
    public Instances getVectorizedFormat() throws Exception {
        if (m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (!m_inputContainsStringAttributes) {
            return m_inputFormat;
        }
        if (m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or finalized yet!");
        }
        if (m_outputFormat != null) {
            return m_outputFormat;
        }

        ArrayList<Attribute> attributes = new ArrayList<Attribute>();
        int newClassIndex = -1;
        for (int att = 0; att < m_inputFormat.numAttributes(); att++) {
            if (m_selectedRange.isInRange(att)) {
                continue;
            }
            if (m_inputFormat.classIndex() == att) {
                // The class lands wherever it falls among the retained attributes.
                newClassIndex = attributes.size();
            }
            attributes.add((Attribute) m_inputFormat.attribute(att).copy());
        }
        for (String word : m_consolidatedDict.keySet()) {
            attributes.add(new Attribute(m_Prefix + word));
        }

        Instances header = new Instances(m_inputFormat.relationName(), attributes, 0);
        if (newClassIndex >= 0) {
            header.setClassIndex(newClassIndex);
        }
        return header;
    }

    /**
     * Vectorizes a whole batch of instances.
     *
     * <p>When {@code z} is true, per-instance normalization is suspended while
     * the batch is vectorized, the average document length is recomputed from
     * the vectorized batch (mean Euclidean length over the word attributes),
     * and the batch is then normalized if normalization is enabled. The
     * normalization setting is restored even if vectorization fails.
     *
     * @param instances the batch to vectorize
     * @param z         true to recompute the average document length from this batch
     * @return the vectorized batch, or {@code instances} itself when the input
     *         has no string attributes
     * @throws Exception if no input format or consolidated dictionary is
     *                   available, or if vectorizing / normalizing an instance fails
     */
    public Instances vectorizeBatch(Instances instances, boolean z) throws Exception {
        if (this.m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (!this.m_inputContainsStringAttributes) {
            return instances;
        }
        if (this.m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or consolidated yet!");
        }

        Instances vectorized = new Instances(this.m_outputFormat, instances.numInstances());
        final boolean normalizeSetting = this.m_normalize;
        if (z) {
            // Lengths must be measured on un-normalized vectors.
            this.m_normalize = false;
        }
        try {
            if (instances.numInstances() > 0) {
                int[] offsetHolder = new int[1];
                for (int i = 0; i < instances.numInstances(); i++) {
                    vectorized.add(vectorizeInstance(instances.instance(i), offsetHolder, true));
                }
                if (z) {
                    this.m_avgDocLength = 0.0d;
                    for (int i = 0; i < vectorized.numInstances(); i++) {
                        Instance current = vectorized.instance(i);
                        double sumOfSquares = 0.0d;
                        for (int pos = 0; pos < current.numValues(); pos++) {
                            // Only word attributes (at or beyond the offset) contribute.
                            if (current.index(pos) >= offsetHolder[0]) {
                                sumOfSquares += current.valueSparse(pos) * current.valueSparse(pos);
                            }
                        }
                        this.m_avgDocLength += Math.sqrt(sumOfSquares);
                    }
                    this.m_avgDocLength /= vectorized.numInstances();
                    if (normalizeSetting) {
                        for (int i = 0; i < vectorized.numInstances(); i++) {
                            normalizeInstance(vectorized.instance(i), offsetHolder[0]);
                        }
                    }
                }
            }
        } finally {
            // Restore the caller's setting even when an exception escapes.
            this.m_normalize = normalizeSetting;
        }
        vectorized.compactify();
        return vectorized;
    }

    /**
     * Vectorizes a single instance, overwriting (rather than accumulating) the
     * values of retained string attributes in the output header.
     *
     * @param instance the instance to vectorize
     * @return the vectorized instance
     * @throws Exception if vectorization fails
     */
    public Instance vectorizeInstance(Instance instance) throws Exception {
        int[] offsetHolder = new int[1];
        return vectorizeInstance(instance, offsetHolder, false);
    }

    /**
     * Vectorizes a single instance.
     *
     * @param instance the instance to vectorize
     * @param z        true to add the values of retained string attributes to
     *                 the output header, false to overwrite them
     * @return the vectorized instance
     * @throws Exception if vectorization fails
     */
    public Instance vectorizeInstance(Instance instance, boolean z) throws Exception {
        int[] offsetHolder = new int[1];
        return vectorizeInstance(instance, offsetHolder, z);
    }

    /**
     * Converts one instance into a sparse vector over the output format.
     *
     * <p>Attributes outside the selected range are copied first, occupying
     * indices {@code 0..offset-1}. The words of the selected string attributes
     * are then looked up in the consolidated dictionary and written at
     * {@code offset + dictionary index}, either as counts or as presence
     * indicators. The optional TF transform, IDF transform and length
     * normalization are applied to the word values only.
     *
     * <p>The IDF factor is computed with floating-point division so that the
     * document ratio is not truncated before the logarithm is taken.
     *
     * @param instance the instance to vectorize
     * @param iArr     single-element holder that receives the index of the first
     *                 word attribute
     * @param z        true to add retained string values to the output header,
     *                 false to overwrite the header's string value
     * @return the sparse vectorized instance, or {@code instance} itself when the
     *         input has no string attributes
     * @throws Exception if no input format or consolidated dictionary is
     *                   available, if document counts needed for IDF are
     *                   missing, or if normalization fails
     */
    private Instance vectorizeInstance(Instance instance, int[] iArr, boolean z) throws Exception {
        if (!this.m_inputContainsStringAttributes) {
            return instance;
        }
        if (this.m_inputFormat == null) {
            throw new Exception("No input format available. Call setup() and make sure a dictionary has been built first.");
        }
        if (this.m_consolidatedDict == null) {
            throw new Exception("Dictionary hasn't been built or consolidated yet!");
        }

        // Sorted by output index so the sparse arrays can be filled in order.
        TreeMap<Integer, double[]> values = new TreeMap<Integer, double[]>();

        // Pass 1: copy the attributes that are not converted to words.
        int offset = 0;
        for (int att = 0; att < this.m_inputFormat.numAttributes(); att++) {
            if (this.m_selectedRange.isInRange(att)) {
                continue;
            }
            if (this.m_inputFormat.attribute(att).isString() || this.m_inputFormat.attribute(att).isRelationValued()) {
                if (instance.isMissing(att)) {
                    values.put(offset, new double[]{Utils.missingValue()});
                } else if (this.m_inputFormat.attribute(att).isString()) {
                    String stringValue = instance.stringValue(att);
                    if (z) {
                        values.put(offset, new double[]{this.m_outputFormat.attribute(offset).addStringValue(stringValue)});
                    } else {
                        this.m_outputFormat.attribute(offset).setStringValue(stringValue);
                        values.put(offset, new double[]{0.0d});
                    }
                } else {
                    if (this.m_outputFormat.attribute(offset).numValues() == 0) {
                        // Register a first relation so the value added below does not get index 0.
                        this.m_outputFormat.attribute(offset).addRelation(this.m_outputFormat.attribute(offset).relation());
                    }
                    values.put(offset, new double[]{this.m_outputFormat.attribute(offset).addRelation(instance.relationalValue(att))});
                }
            } else if (instance.value(att) != 0.0d) {
                // Zero values are implicit in a sparse instance.
                values.put(offset, new double[]{instance.value(att)});
            }
            offset++;
        }
        iArr[0] = offset;

        // Pass 2: tokenize the selected attributes and record dictionary hits.
        for (int att = 0; att < this.m_inputFormat.numAttributes(); att++) {
            if (!this.m_selectedRange.isInRange(att) || instance.isMissing(att)) {
                continue;
            }
            this.m_tokenizer.tokenize(instance.stringValue(att));
            while (this.m_tokenizer.hasMoreElements()) {
                String word = this.m_tokenizer.nextElement();
                if (this.m_lowerCaseTokens) {
                    word = word.toLowerCase();
                }
                int[] dictEntry = this.m_consolidatedDict.get(this.m_stemmer.stem(word));
                if (dictEntry == null) {
                    continue;
                }
                Integer outputIndex = Integer.valueOf(dictEntry[0] + offset);
                double[] cell = this.m_outputCounts ? values.get(outputIndex) : null;
                if (cell != null) {
                    cell[0] = cell[0] + 1.0d;
                } else {
                    values.put(outputIndex, new double[]{1.0d});
                }
            }
        }

        if (this.m_TFTransform) {
            for (Map.Entry<Integer, double[]> entry : values.entrySet()) {
                if (entry.getKey().intValue() >= offset) {
                    double[] cell = entry.getValue();
                    cell[0] = Math.log(cell[0] + 1.0d);
                }
            }
        }

        if (this.m_IDFTransform) {
            for (Map.Entry<Integer, double[]> entry : values.entrySet()) {
                int outputIndex = entry.getKey().intValue();
                if (outputIndex < offset) {
                    continue;
                }
                double[] cell = entry.getValue();
                // Strip the prefix from the attribute name to recover the dictionary word.
                String word = this.m_outputFormat.attribute(outputIndex).name().substring(this.m_Prefix.length());
                int[] dictEntry = this.m_consolidatedDict.get(word);
                if (dictEntry == null) {
                    throw new Exception("This should never occur");
                }
                if (dictEntry.length != 2) {
                    throw new Exception("Can't compute IDF transform as document counts are not available");
                }
                cell[0] = cell[0] * Math.log(this.m_count / (double) dictEntry[1]);
            }
        }

        double[] sparseValues = new double[values.size()];
        int[] sparseIndices = new int[values.size()];
        int next = 0;
        for (Map.Entry<Integer, double[]> entry : values.entrySet()) {
            sparseValues[next] = entry.getValue()[0];
            sparseIndices[next] = entry.getKey().intValue();
            next++;
        }

        SparseInstance result = new SparseInstance(instance.weight(), sparseValues, sparseIndices, this.m_outputFormat.numAttributes());
        result.setDataset(this.m_outputFormat);
        if (this.m_normalize) {
            normalizeInstance(result, offset);
        }
        return result;
    }

    /**
     * Rescales the word values of an instance so that their Euclidean length
     * equals the average document length.
     *
     * <p>Only stored values whose attribute index is at or beyond {@code i} and
     * is not the class index are measured and rescaled.
     *
     * @param instance the instance to normalize in place
     * @param i        index of the first word attribute
     * @throws Exception if the average document length is not greater than
     *                   {@code KStarConstants.FLOOR} (presumably zero)
     */
    private void normalizeInstance(Instance instance, int i) throws Exception {
        if (m_avgDocLength <= KStarConstants.FLOOR) {
            throw new Exception("Average document length is not set!");
        }

        double sumOfSquares = 0.0d;
        for (int pos = 0; pos < instance.numValues(); pos++) {
            int attIndex = instance.index(pos);
            if (attIndex >= i && attIndex != m_outputFormat.classIndex()) {
                double value = instance.valueSparse(pos);
                sumOfSquares += value * value;
            }
        }
        final double norm = Math.sqrt(sumOfSquares);

        for (int pos = 0; pos < instance.numValues(); pos++) {
            int attIndex = instance.index(pos);
            if (attIndex < i || attIndex == m_outputFormat.classIndex()) {
                continue;
            }
            double scaled = (instance.valueSparse(pos) * m_avgDocLength) / norm;
            instance.setValueSparse(pos, scaled);
            if (scaled == KStarConstants.FLOOR) {
                System.err.println("setting value " + instance.index(pos) + " to zero.");
                // Revisit the same position on the next pass; presumably the
                // zeroed entry has been dropped from the sparse storage.
                pos--;
            }
        }
    }

    /**
     * Adds the words of one instance to the dictionary of its class.
     *
     * <p>Does nothing if the input has no string attributes, or if per-class
     * operation is active and the instance's class value is missing. Each
     * selected, non-missing string value is tokenized, optionally lower-cased,
     * stemmed and filtered through the stopwords handler. The resulting
     * per-instance counts are merged into the class dictionary as
     * {term count, document count}. The document length is accumulated when
     * normalization is on, the instance counter is incremented and
     * pruneDictionary() is invoked.
     *
     * @param instance the instance to process
     */
    public void processInstance(Instance instance) {
        if (!m_inputContainsStringAttributes) {
            return;
        }

        if (m_inputVector != null) {
            m_inputVector.clear();
        } else {
            m_inputVector = new LinkedHashMap<String, int[]>();
        }

        int classSlot = 0;
        if (!m_doNotOperateOnPerClassBasis && m_classIndex >= 0 && m_inputFormat.classAttribute().isNominal()) {
            if (instance.classIsMissing()) {
                return;
            }
            classSlot = (int) instance.classValue();
        }

        for (int att = 0; att < instance.numAttributes(); att++) {
            if (!m_selectedRange.isInRange(att) || instance.isMissing(att)) {
                continue;
            }
            m_tokenizer.tokenize(instance.stringValue(att));
            while (m_tokenizer.hasMoreElements()) {
                String token = m_tokenizer.nextElement();
                if (m_lowerCaseTokens) {
                    token = token.toLowerCase();
                }
                String word = m_stemmer.stem(token);
                if (m_stopwordsHandler.isStopword(word)) {
                    continue;
                }
                int[] seen = m_inputVector.get(word);
                if (seen != null) {
                    seen[0] = seen[0] + 1;
                } else {
                    // {term count, document count} for this single document
                    m_inputVector.put(word, new int[]{1, 1});
                }
            }
        }

        double sumOfSquares = 0.0d;
        for (Map.Entry<String, int[]> wordEntry : m_inputVector.entrySet()) {
            String word = wordEntry.getKey();
            int[] docCounts = wordEntry.getValue();
            int[] totals = m_dictsPerClass[classSlot].get(word);
            if (totals == null) {
                totals = new int[2];
                m_dictsPerClass[classSlot].put(word, totals);
            }
            totals[0] = totals[0] + docCounts[0];
            totals[1] = totals[1] + docCounts[1];
            sumOfSquares += docCounts[0] * docCounts[0];
        }
        if (m_normalize) {
            m_docLengthSum += Math.sqrt(sumOfSquares);
        }

        m_count++;
        pruneDictionary();
    }

    /**
     * Drops infrequent terms from every per-class dictionary.
     * <p>
     * Pruning only runs when a positive periodic prune rate is configured and the running
     * instance counter {@code m_count} is an exact multiple of that rate. When it runs, every
     * entry whose first counter (slot 0 of its {@code int[]}) is below {@code m_minFrequency}
     * is removed.
     */
    protected void pruneDictionary() {
        boolean pruningEnabled = this.m_periodicPruneRate > 0;
        // The short-circuit keeps the modulo from being evaluated with a zero rate.
        if (pruningEnabled && this.m_count % this.m_periodicPruneRate == 0) {
            for (int classIdx = 0; classIdx < this.m_dictsPerClass.length; classIdx++) {
                // Removing through the values view removes the backing map entry as well.
                Iterator<int[]> counters = this.m_dictsPerClass[classIdx].values().iterator();
                while (counters.hasNext()) {
                    int[] termCounters = counters.next();
                    if (termCounters[0] < this.m_minFrequency) {
                        counters.remove();
                    }
                }
            }
        }
    }

    /**
     * Returns this builder to its initial, empty state: both data formats and both dictionary
     * structures are cleared and the running statistics are set back to their start values.
     */
    public void reset() {
        // Data formats.
        this.m_inputFormat = null;
        this.m_outputFormat = null;

        // Dictionaries (per-class and consolidated).
        this.m_dictsPerClass = null;
        this.m_consolidatedDict = null;

        // Running statistics.
        this.m_count = 0;
        this.m_avgDocLength = KStarConstants.FLOOR;
        this.m_docLengthSum = KStarConstants.FLOOR;
    }

    /**
     * Gives access to the per-class dictionaries built so far.
     *
     * @param z when {@code true}, {@link #pruneDictionary()} is invoked before the dictionaries
     *          are returned
     * @return the per-class dictionaries (the internal array, not a copy)
     * @throws WekaException if no per-class dictionaries exist
     */
    public Map<String, int[]>[] getDictionaries(boolean z) throws WekaException {
        if (this.m_dictsPerClass != null) {
            if (z) {
                pruneDictionary();
            }
            return this.m_dictsPerClass;
        }
        throw new WekaException("No dictionaries have been built yet!");
    }

    /**
     * Merges the per-class dictionaries and running statistics of another builder into this one.
     * <p>
     * For every term of the other builder, both counters (slot 0 and slot 1 of its
     * {@code int[]}) are added to the matching entry of the dictionary with the same class
     * index; terms this builder has not seen yet are created with zeroed counters first.
     * {@code m_count} and {@code m_docLengthSum} are summed as well.
     *
     * @param dictionaryBuilder the builder whose state is merged into this one
     * @return this builder
     * @throws Exception if either builder has no per-class dictionaries, or if the two builders
     *                   hold a different number of dictionaries
     */
    @Override // weka.core.Aggregateable
    public DictionaryBuilder aggregate(DictionaryBuilder dictionaryBuilder) throws Exception {
        Map<String, int[]>[] incoming = dictionaryBuilder.getDictionaries(false);
        // Fail with a descriptive error instead of a bare NullPointerException when this
        // builder has no per-class dictionaries (never initialised, reset, or already finalized).
        if (this.m_dictsPerClass == null) {
            throw new WekaException("No dictionaries have been built yet!");
        }
        if (incoming.length != this.m_dictsPerClass.length) {
            throw new Exception("Number of dictionaries from the builder to be aggregated does not match our number of dictionaries");
        }
        for (int classIdx = 0; classIdx < incoming.length; classIdx++) {
            Map<String, int[]> target = this.m_dictsPerClass[classIdx];
            for (Map.Entry<String, int[]> entry : incoming[classIdx].entrySet()) {
                int[] theirs = entry.getValue();
                int[] ours = target.get(entry.getKey());
                if (ours == null) {
                    ours = new int[2];
                    target.put(entry.getKey(), ours);
                }
                ours[0] += theirs[0];
                ours[1] += theirs[1];
            }
        }
        this.m_count += dictionaryBuilder.m_count;
        this.m_docLengthSum += dictionaryBuilder.m_docLengthSum;
        return this;
    }

    /**
     * Completes an aggregation by finalizing the dictionary.
     *
     * @throws Exception if {@link #finalizeDictionary()} fails
     */
    @Override // weka.core.Aggregateable
    public void finalizeAggregation() throws Exception {
        // Only the side effects matter here; the returned dictionary is deliberately ignored.
        this.finalizeDictionary();
    }

    /**
     * Collapses the per-class dictionaries into the single consolidated dictionary.
     * <p>
     * For each class a minimum count is determined: {@code m_minFrequency}, or, when the class
     * dictionary holds at least {@code m_wordsToKeep} terms, the larger of
     * {@code m_minFrequency} and the {@code m_wordsToKeep}-th highest term count. Every term
     * whose count reaches its class threshold is entered into the consolidated dictionary,
     * where slot 0 of the value is the term's index (assigned in order of first insertion) and
     * slot 1 accumulates the slot-1 counters of all classes in which the term was kept.
     * <p>
     * Afterwards the per-class dictionaries are discarded, the average document length is
     * computed if normalization is enabled, and the output format is rebuilt.
     *
     * @return the consolidated dictionary; the previously built one if it already exists; or
     *         {@code null} if the input contains no string attributes
     * @throws Exception if no per-class dictionaries are available, or if building the output
     *                   format fails
     */
    public Map<String, int[]> finalizeDictionary() throws Exception {
        if (!this.m_inputContainsStringAttributes) {
            return null;
        }
        if (this.m_consolidatedDict != null) {
            return this.m_consolidatedDict;
        }
        if (this.m_dictsPerClass == null) {
            // NOTE(review): prints this object's hash code to stderr before failing; this looks
            // like leftover debugging output. Kept so the observable behaviour is unchanged.
            System.err.println(hashCode());
            throw new WekaException("No dictionary built yet!");
        }

        // Step 1: work out the minimum term count required for each class.
        int[] minCountPerClass = new int[this.m_dictsPerClass.length];
        for (int classIdx = 0; classIdx < minCountPerClass.length; classIdx++) {
            int[] termCounts = new int[this.m_dictsPerClass[classIdx].size()];
            int next = 0;
            for (int[] counters : this.m_dictsPerClass[classIdx].values()) {
                termCounts[next++] = counters[0];
            }
            if (termCounts.length < this.m_wordsToKeep) {
                minCountPerClass[classIdx] = this.m_minFrequency;
            } else {
                // Ascending sort: the m_wordsToKeep-th highest count sits m_wordsToKeep from the end.
                Arrays.sort(termCounts);
                minCountPerClass[classIdx] =
                    Math.max(this.m_minFrequency, termCounts[termCounts.length - this.m_wordsToKeep]);
            }
        }

        // Step 2: merge the surviving terms of all classes into one dictionary.
        Map<String, int[]> consolidated = new LinkedHashMap<String, int[]>();
        int nextIndex = 0;
        for (int classIdx = 0; classIdx < minCountPerClass.length; classIdx++) {
            for (Map.Entry<String, int[]> entry : this.m_dictsPerClass[classIdx].entrySet()) {
                int[] classCounters = entry.getValue();
                if (classCounters[0] >= minCountPerClass[classIdx]) {
                    int[] merged = consolidated.get(entry.getKey());
                    if (merged == null) {
                        // Two slots are required: {index, accumulated slot-1 count}. Allocating
                        // a single-element array here made the accumulation below throw
                        // ArrayIndexOutOfBoundsException for the very first kept term.
                        merged = new int[]{nextIndex++, 0};
                        consolidated.put(entry.getKey(), merged);
                    }
                    merged[1] += classCounters[1];
                }
            }
        }

        this.m_consolidatedDict = consolidated;
        this.m_dictsPerClass = null;
        if (this.m_normalize) {
            this.m_avgDocLength = this.m_docLengthSum / this.m_count;
        }
        this.m_outputFormat = getVectorizedFormat();
        return this.m_consolidatedDict;
    }

    /**
     * Loads a dictionary from the file at the given path.
     *
     * @param str path of the dictionary file
     * @param z   passed through unchanged to {@link #loadDictionary(File, boolean)}
     * @throws IOException if loading fails
     */
    public void loadDictionary(String str, boolean z) throws IOException {
        File dictionaryFile = new File(str);
        loadDictionary(dictionaryFile, z);
    }

    /**
     * Loads a dictionary from a file, either as text or as a serialized object.
     * <p>
     * The file handle opened here is always closed by this method, even when the delegate
     * fails before it takes over the stream (for example when the serialization header of a
     * binary file cannot be read). Closing a stream the delegate has already closed is a no-op.
     *
     * @param file the dictionary file
     * @param z    {@code true} to read the file as text via {@link #loadDictionary(Reader)},
     *             {@code false} to read it as a serialized object via
     *             {@link #loadDictionary(InputStream)}
     * @throws IOException if the file cannot be opened or its content cannot be loaded
     */
    public void loadDictionary(File file, boolean z) throws IOException {
        if (z) {
            try (Reader textIn = new FileReader(file)) {
                loadDictionary(textIn);
            }
        } else {
            try (InputStream binaryIn = new FileInputStream(file)) {
                loadDictionary(binaryIn);
            }
        }
    }

    /**
     * Loads a dictionary in text form, replacing any consolidated dictionary held so far.
     * <p>
     * Expected layout: an optional first line of the form {@code @@@<number>@@@} carrying the
     * average document length, followed by one line per term. Whether the term lines carry a
     * trailing count (the text after the last delimiter) is decided from the first term line
     * alone: if that text parses as an integer, every following line must also end in a
     * delimiter plus an integer; otherwise each line is taken verbatim as a term. Terms are
     * indexed in file order starting at 0 (slot 0 of the value); the count, when present, goes
     * into slot 1.
     * <p>
     * The reader is always closed. On success the output format is rebuilt.
     *
     * @param reader source of the dictionary text
     * @throws IOException if the input is empty, a line is malformed, reading fails, or the
     *                     output format cannot be built
     */
    public void loadDictionary(Reader reader) throws IOException {
        BufferedReader bufferedReader = new BufferedReader(reader);
        this.m_consolidatedDict = new LinkedHashMap<String, int[]>();
        try {
            String line = bufferedReader.readLine();
            if (line == null) {
                throw new IOException("Empty dictionary file!");
            }

            // Optional header line holding the average document length.
            if (line.startsWith("@@@") && line.endsWith("@@@")) {
                String avgText = line.replace("@@@", "");
                try {
                    this.m_avgDocLength = Double.parseDouble(avgText);
                } catch (NumberFormatException e) {
                    System.err.println("Unable to parse average document length '" + avgText + MetricListWriter.WEKA_DELIMITER);
                }
                line = bufferedReader.readLine();
                if (line == null) {
                    throw new IOException("Empty dictionary file!");
                }
            }

            int nextIndex = 0;
            boolean hasCounts = false;

            // The first term line decides whether the file carries counts.
            int firstSplit = line.lastIndexOf(FaultExtractionEngine.EXPORT_CSV_DELIMITER);
            if (firstSplit > 0) {
                try {
                    // Parse before touching any state, so a failed parse cannot leave the
                    // loader in "has counts" mode or consume an index.
                    int count = Integer.parseInt(line.substring(firstSplit + 1).trim());
                    hasCounts = true;
                    this.m_consolidatedDict.put(line.substring(0, firstSplit), new int[]{nextIndex++, count});
                } catch (NumberFormatException ignored) {
                    // Not a count after all: the file is a plain list of terms.
                }
            }
            if (!hasCounts) {
                // Keep the first term as well; previously it was silently dropped.
                this.m_consolidatedDict.put(line, new int[]{nextIndex++});
            }

            while ((line = bufferedReader.readLine()) != null) {
                int[] value = new int[hasCounts ? 2 : 1];
                value[0] = nextIndex++;
                String term = line;
                if (hasCounts) {
                    int split = line.lastIndexOf(FaultExtractionEngine.EXPORT_CSV_DELIMITER);
                    if (split < 0) {
                        // Report malformed input as an I/O problem instead of letting
                        // substring() fail with StringIndexOutOfBoundsException.
                        throw new IOException("Missing count in dictionary line '" + line + "'");
                    }
                    term = line.substring(0, split);
                    try {
                        value[1] = Integer.parseInt(line.substring(split + 1).trim());
                    } catch (NumberFormatException e) {
                        throw new IOException(e);
                    }
                }
                this.m_consolidatedDict.put(term, value);
            }

            try {
                this.m_outputFormat = getVectorizedFormat();
            } catch (Exception e) {
                throw new IOException(e);
            }
        } finally {
            bufferedReader.close();
        }
    }

    /**
     * Loads a dictionary that was stored as a serialized Java object.
     * <p>
     * The stream must contain a {@link List} whose element 0 is the average document length
     * (a {@link Double}) and whose element 1 is the dictionary (a {@link Map}). The state of
     * this builder is only updated once the whole payload has been read and checked. The given
     * stream is always closed, including when the serialization header cannot be read.
     * <p>
     * SECURITY: this uses Java native deserialization; only feed it data from a trusted source.
     * <p>
     * NOTE(review): unlike {@link #loadDictionary(Reader)}, this method does not rebuild
     * {@code m_outputFormat}; confirm whether that is intended.
     *
     * @param inputStream stream positioned at the serialized dictionary
     * @throws IOException if reading fails, a serialized class cannot be resolved, or the
     *                     payload does not have the expected structure
     */
    public void loadDictionary(InputStream inputStream) throws IOException {
        // Registering the raw stream as a resource of its own guarantees that it is closed
        // even if the ObjectInputStream constructor throws while reading the stream header.
        try (InputStream raw = inputStream;
             ObjectInputStream objectIn = new ObjectInputStream(new BufferedInputStream(raw))) {
            Object payload = objectIn.readObject();
            if (!(payload instanceof List) || ((List<?>) payload).size() < 2) {
                throw new IOException("Unexpected dictionary file content: expected a list of two elements");
            }
            List<?> holder = (List<?>) payload;
            Object avgDocLength = holder.get(0);
            Object dictionary = holder.get(1);
            if (!(avgDocLength instanceof Double) || !(dictionary instanceof Map)) {
                throw new IOException("Unexpected dictionary file content: expected an average document length and a dictionary");
            }
            // Key and value types are erased at runtime and therefore cannot be verified here.
            @SuppressWarnings("unchecked")
            Map<String, int[]> loaded = (Map<String, int[]>) dictionary;
            this.m_avgDocLength = ((Double) avgDocLength).doubleValue();
            this.m_consolidatedDict = loaded;
        } catch (ClassNotFoundException e) {
            throw new IOException(e);
        }
    }

    /**
     * Saves the dictionary to the file at the given path.
     *
     * @param str path of the file to write
     * @param z   passed through unchanged to {@link #saveDictionary(File, boolean)}
     * @throws IOException if saving fails
     */
    public void saveDictionary(String str, boolean z) throws IOException {
        File dictionaryFile = new File(str);
        saveDictionary(dictionaryFile, z);
    }

    /**
     * Saves the dictionary to a file, either as text or as a serialized object.
     * <p>
     * The file handle opened here is always closed by this method, even when the delegate
     * throws before it takes over the stream (for example because there is no dictionary to
     * save). Closing a stream the delegate has already closed is a no-op. Note that the file
     * is opened before the delegate validates the builder's state.
     *
     * @param file the file to write
     * @param z    {@code true} to write text via {@link #saveDictionary(Writer)},
     *             {@code false} to write a serialized object via
     *             {@link #saveDictionary(OutputStream)}
     * @throws IOException if the file cannot be opened or the dictionary cannot be written
     */
    public void saveDictionary(File file, boolean z) throws IOException {
        if (z) {
            try (Writer textOut = new FileWriter(file)) {
                saveDictionary(textOut);
            }
        } else {
            try (OutputStream binaryOut = new FileOutputStream(file)) {
                saveDictionary(binaryOut);
            }
        }
    }

    /**
     * Writes the consolidated dictionary as text.
     * <p>
     * If the average document length is above {@code KStarConstants.FLOOR}, a header line
     * {@code @@@<average>@@@} is written first. Each term then follows on its own line as the
     * term, the delimiter, and the value's slot-1 counter (left empty when the value has only
     * one slot). The writer is flushed and closed when writing ends, whether it succeeded or
     * not; it is left untouched if one of the precondition checks fails.
     *
     * @param writer destination of the dictionary text
     * @throws IOException if the input had no string attributes, no consolidated dictionary
     *                     exists, or writing fails
     */
    public void saveDictionary(Writer writer) throws IOException {
        if (!this.m_inputContainsStringAttributes) {
            throw new IOException("Input did not contain any string attributes!");
        }
        if (this.m_consolidatedDict == null) {
            throw new IOException("No dictionary to save!");
        }
        // close() flushes the buffer itself. Using try-with-resources means a failing flush
        // can no longer skip close(), and a cleanup failure no longer hides the exception
        // that caused it.
        try (BufferedWriter out = new BufferedWriter(writer)) {
            if (this.m_avgDocLength > KStarConstants.FLOOR) {
                out.write("@@@" + this.m_avgDocLength + "@@@\n");
            }
            for (Map.Entry<String, int[]> entry : this.m_consolidatedDict.entrySet()) {
                int[] value = entry.getValue();
                String count = value.length > 1 ? String.valueOf(value[1]) : "";
                out.write(entry.getKey() + FaultExtractionEngine.EXPORT_CSV_DELIMITER + count + StringUtils.LF);
            }
        }
    }

    /**
     * Writes the consolidated dictionary as a serialized Java object.
     * <p>
     * The payload is a list whose element 0 is the average document length and whose element 1
     * is the consolidated dictionary, which is the structure
     * {@link #loadDictionary(InputStream)} reads back. Once the precondition checks have
     * passed, the given stream is always closed, including when the serialization header
     * cannot be written; it is left untouched if one of the checks fails.
     *
     * @param outputStream destination of the serialized dictionary
     * @throws IOException if the input had no string attributes, no consolidated dictionary
     *                     exists, or writing fails
     */
    public void saveDictionary(OutputStream outputStream) throws IOException {
        if (!this.m_inputContainsStringAttributes) {
            throw new IOException("Input did not contain any string attributes!");
        }
        if (this.m_consolidatedDict == null) {
            throw new IOException("No dictionary to save!");
        }
        List<Object> holder = new ArrayList<Object>();
        holder.add(Double.valueOf(this.m_avgDocLength));
        holder.add(this.m_consolidatedDict);
        // Registering the raw stream as a resource of its own guarantees that it is closed
        // even if the ObjectOutputStream constructor throws. close() flushes, so a failing
        // flush can no longer skip close() or hide the exception that caused it.
        try (OutputStream raw = outputStream;
             ObjectOutputStream objectOut = new ObjectOutputStream(new BufferedOutputStream(raw))) {
            objectOut.writeObject(holder);
            objectOut.flush();
        }
    }
}
