package org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv;

import org.kohsuke.args4j.Option;
import org.openimaj.hadoop.mapreduce.MultiStagedJob;
import org.openimaj.hadoop.tools.HadoopToolsUtil;
import org.openimaj.hadoop.tools.twitter.HadoopTwitterTokenToolOptions;
import org.openimaj.hadoop.tools.twitter.token.mode.TwitterTokenMode;
import org.openimaj.hadoop.tools.twitter.token.mode.dfidf.CountTweetsInTimeperiod;
import org.openimaj.hadoop.tools.twitter.token.mode.dfidf.CountWordsAcrossTimeperiod;
import org.openimaj.hadoop.tools.twitter.token.outputmode.TwitterTokenOutputMode;

/* loaded from: input_file:org/openimaj/hadoop/tools/twitter/token/outputmode/sparsecsv/SparseCSVTokenOutputMode.class */
/**
 * Output mode that runs three consecutive {@link MultiStagedJob}s over the
 * final output of a completed {@link TwitterTokenMode}: a word index, a time
 * index and finally the values themselves. All three jobs write below the
 * same configured output path.
 */
public class SparseCSVTokenOutputMode extends TwitterTokenOutputMode {
    /** The job currently being assembled/run; replaced for each of the three phases. */
    private MultiStagedJob stages;

    /** Number of reducers handed to the {@link Values} stage. */
    @Option(name = "--value-reduce-split", aliases = {"-vrs"}, required = false, usage = "The number of reducers to use when spitting out the DFIDF values")
    int valueSplitReduce = 1;

    /** Total-occurrence threshold handed to the {@link WordIndex} stage. */
    @Option(name = "--word-occurence-threshold", aliases = {"-wot"}, required = false, usage = "The number of times a given word must appear total throughout the time period before it is involved in the count and index")
    int wordCountThreshold = 0;

    /** Per-time-period occurrence threshold handed to the {@link WordIndex} stage. */
    @Option(name = "--word-time-occurence-threshold", aliases = {"-wtot"}, required = false, usage = "The number of times a given word must appear in one or more time period before the word is chosen for indexing")
    int wordTimeCountThreshold = 0;

    /** Top-n limit handed to the {@link WordIndex} stage; defaults to -1. */
    @Option(name = "--top-n-words", aliases = {"-tnw"}, required = false, usage = "Select only the top n words (as ordered by total occurence in the time period)")
    int topNWords = -1;

    /** Whether the {@link Values} stage is asked to sort by time instead of word. */
    @Option(name = "--sort-value-by-time", aliases = {"-svbt"}, required = false, usage = "This flag sorts value by time instead of word")
    boolean sortValueByTime = false;

    /**
     * Whether matlab output is requested from the {@link Values} stage.
     * {@link #write} clears this flag unless {@link #sortValueByTime} is also set.
     */
    @Option(name = "--matlab-output", aliases = {"-matlab"}, required = false, usage = "This flag requests matlab output of the values; it only takes effect when --sort-value-by-time is also set")
    boolean matlabOutput = false;

    /**
     * Validates the output location and then runs, in order, the word index
     * job, the time index job and the values job.
     *
     * @param hadoopTwitterTokenToolOptions the tool options; supplies the
     *        arguments passed to every job
     * @param twitterTokenMode the completed mode whose final output is used
     *        as the input of every job
     * @throws Exception if validation, job construction or any job run fails
     */
    @Override
    public void write(HadoopTwitterTokenToolOptions hadoopTwitterTokenToolOptions, TwitterTokenMode twitterTokenMode) throws Exception {
        HadoopToolsUtil.validateOutput(this.outputPath, this.replace);

        // Phase 1: word index, read from the word-count directory.
        this.stages = newStagedJob(hadoopTwitterTokenToolOptions, twitterTokenMode, CountWordsAcrossTimeperiod.WORDCOUNT_DIR);
        // Matlab output is only honoured when values are sorted by time.
        this.matlabOutput = this.matlabOutput && this.sortValueByTime;
        new WordIndex(this.wordCountThreshold, this.wordTimeCountThreshold, this.topNWords).stage(this.stages);
        this.stages.runAll();

        // Phase 2: time index, read from the time-count directory.
        this.stages = newStagedJob(hadoopTwitterTokenToolOptions, twitterTokenMode, CountTweetsInTimeperiod.TIMECOUNT_DIR);
        this.stages.queueStage(new TimeIndex().m22stage());
        this.stages.runAll();

        // Phase 3: the values, read from the word-count directory again.
        this.stages = newStagedJob(hadoopTwitterTokenToolOptions, twitterTokenMode, CountWordsAcrossTimeperiod.WORDCOUNT_DIR);
        this.stages.queueStage(new Values(this.outputPath, this.valueSplitReduce, this.sortValueByTime, this.matlabOutput).m23stage());
        this.stages.runAll();
    }

    /**
     * Builds a fresh job that reads the given sub-directory of the completed
     * mode's final output and writes to this mode's output path.
     *
     * @param options the tool options; supplies the job arguments
     * @param completedMode the mode whose final output is the job input
     * @param inputDirectory the sub-directory of the final output to read
     * @return a new, empty multi-staged job
     * @throws Exception if the input or output paths cannot be resolved
     */
    private MultiStagedJob newStagedJob(HadoopTwitterTokenToolOptions options, TwitterTokenMode completedMode, String inputDirectory) throws Exception {
        return new MultiStagedJob(
                HadoopToolsUtil.getInputPaths(completedMode.finalOutput(options), inputDirectory),
                HadoopToolsUtil.getOutputPath(this.outputPath),
                options.getArgs());
    }
}
