package org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv;

import com.Ostermiller.util.CSVParser;
import com.Ostermiller.util.CSVPrinter;
import com.jmatio.io.MatFileWriter;
import com.jmatio.types.MLCell;
import com.jmatio.types.MLChar;
import com.jmatio.types.MLDouble;
import java.io.BufferedReader;
import java.io.DataInput;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.io.StringWriter;
import java.nio.channels.Channels;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.openimaj.hadoop.mapreduce.MultiStagedJob;
import org.openimaj.hadoop.mapreduce.stage.StageAppender;
import org.openimaj.hadoop.mapreduce.stage.helper.SequenceFileStage;
import org.openimaj.hadoop.mapreduce.stage.helper.SequenceFileTextStage;
import org.openimaj.hadoop.tools.HadoopToolsUtil;
import org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.WordIndexSort;
import org.openimaj.hadoop.tools.twitter.utils.WordDFIDF;
import org.openimaj.io.IOUtils;
import org.openimaj.io.wrappers.ReadableListBinary;
import org.openimaj.util.pair.IndependentPair;

/* loaded from: input_file:org/openimaj/hadoop/tools/twitter/token/outputmode/sparsecsv/WordIndex.class */
public class WordIndex extends StageAppender {
    /**
     * Job configuration key for the total word count threshold. Written by the first
     * stage queued in {@code stage(...)} and read in {@code Reduce.setup} (default 0).
     */
    protected static final String WORDCOUNT_THRESH = "org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.wordcountthresh";
    /**
     * Job configuration key for the "top N words" setting. Written by the second stage
     * queued in {@code stage(...)}; the code that reads it is not in this class
     * (presumably {@code WordIndexSort.Reduce} - TODO confirm).
     */
    protected static final String WORDCOUNT_TOPN = "org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.wordcounttopn";
    /**
     * Job configuration key for the per-record word count threshold. Written by the first
     * stage queued in {@code stage(...)} and read in {@code Map.setup} (default 0).
     */
    protected static final String WORDCOUNT_TIMETHRESH = "org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.wordtimecountthresh";
    /** Value published under {@link #WORDCOUNT_THRESH}; the reducer drops words whose summed count is below it. */
    private int wordCountThreshold;
    /** Value published under {@link #WORDCOUNT_TOPN}; the no-argument constructor sets it to -1. */
    private int topNWords;
    /**
     * Value published under {@link #WORDCOUNT_TIMETHRESH}. Only the three-argument
     * constructor assigns it, so it is 0 otherwise.
     */
    private int wordTimeThreshold;

    /* loaded from: input_file:org/openimaj/hadoop/tools/twitter/token/outputmode/sparsecsv/WordIndex$Map.class */
    public static class Map extends Mapper<Text, BytesWritable, Text, LongWritable> {
        private int wordTimeCountThresh;

        protected void setup(Mapper<Text, BytesWritable, Text, LongWritable>.Context context) throws IOException, InterruptedException {
            this.wordTimeCountThresh = context.getConfiguration().getInt(WordIndex.WORDCOUNT_TIMETHRESH, 0);
        }

        public void map(Text text, BytesWritable bytesWritable, Mapper<Text, BytesWritable, Text, LongWritable>.Context context) throws InterruptedException {
            try {
                final long[] jArr = {0};
                final boolean[] zArr = {false};
                IOUtils.deserialize(bytesWritable.getBytes(), new ReadableListBinary<Object>(new ArrayList()) { // from class: org.openimaj.hadoop.tools.twitter.token.outputmode.sparsecsv.WordIndex.Map.1
                    protected Object readValue(DataInput dataInput) throws IOException {
                        WordDFIDF wordDFIDF = new WordDFIDF();
                        wordDFIDF.readBinary(dataInput);
                        if (wordDFIDF.wf > Map.this.wordTimeCountThresh) {
                            zArr[0] = true;
                        }
                        if (jArr[0] < wordDFIDF.Twf) {
                            jArr[0] = wordDFIDF.Twf;
                        }
                        return new Object();
                    }
                });
                if (zArr[0]) {
                    context.write(text, new LongWritable(jArr[0]));
                }
            } catch (IOException e) {
                System.err.println("Couldnt read word: " + text);
            }
        }

        public /* bridge */ /* synthetic */ void map(Object obj, Object obj2, Mapper.Context context) throws IOException, InterruptedException {
            map((Text) obj, (BytesWritable) obj2, (Mapper<Text, BytesWritable, Text, LongWritable>.Context) context);
        }
    }

    /* loaded from: input_file:org/openimaj/hadoop/tools/twitter/token/outputmode/sparsecsv/WordIndex$Reduce.class */
    public static class Reduce extends Reducer<Text, LongWritable, LongWritable, Text> {
        private int wordCountThresh;

        protected void setup(Reducer<Text, LongWritable, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            this.wordCountThresh = context.getConfiguration().getInt(WordIndex.WORDCOUNT_THRESH, 0);
        }

        public void reduce(Text text, Iterable<LongWritable> iterable, Reducer<Text, LongWritable, LongWritable, Text>.Context context) throws IOException, InterruptedException {
            long j = 0;
            Iterator<LongWritable> it = iterable.iterator();
            while (it.hasNext()) {
                j += it.next().get();
            }
            if (j < this.wordCountThresh) {
                return;
            }
            StringWriter stringWriter = new StringWriter();
            CSVPrinter cSVPrinter = new CSVPrinter(stringWriter);
            cSVPrinter.write(new String[]{text.toString(), j + ""});
            cSVPrinter.flush();
            context.write(new LongWritable(j), new Text(stringWriter.toString()));
        }

        public /* bridge */ /* synthetic */ void reduce(Object obj, Iterable iterable, Reducer.Context context) throws IOException, InterruptedException {
            reduce((Text) obj, (Iterable<LongWritable>) iterable, (Reducer<Text, LongWritable, LongWritable, Text>.Context) context);
        }
    }

    /**
     * Creates a word index stage with a word count threshold and a top-N setting; the
     * word time threshold stays at 0.
     *
     * @param i the word count threshold
     * @param i2 the top-N words setting
     */
    public WordIndex(int i, int i2) {
        // Same effect as assigning the two fields directly: the time threshold defaults to 0.
        this(i, 0, i2);
    }

    /**
     * Creates a word index stage with all three settings.
     *
     * @param i the word count threshold
     * @param i2 the word time threshold
     * @param i3 the top-N words setting
     */
    public WordIndex(int i, int i2, int i3) {
        wordCountThreshold = i;
        wordTimeThreshold = i2;
        topNWords = i3;
    }

    /**
     * Creates a word index stage with a word count threshold of 0 and a top-N setting of -1.
     */
    public WordIndex() {
        this(0, -1);
    }

    /**
     * Reads the word index found under the {@code "/words"} sub-path of the given location.
     *
     * @param str the base location; {@code "/words"} is appended to it
     * @return the result of the two-argument overload for that location
     * @throws IOException if the two-argument overload fails
     */
    public static LinkedHashMap<String, IndependentPair<Long, Long>> readWordCountLines(String str) throws IOException {
        final String wordsSubPath = "/words";
        LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = readWordCountLines(str, wordsSubPath);
        return wordIndex;
    }

    /**
     * Reads CSV lines of the form {@code word,count} from the first input path resolved
     * for {@code str + str2} and builds an insertion-ordered index.
     *
     * <p>Each well-formed line maps the word to a pair of (count, zero-based position among
     * the well-formed lines). Lines that do not have exactly two fields are reported on
     * {@code System.out} and skipped without consuming a position. Reading stops at end of
     * input or at the first empty line.
     *
     * <p>The underlying stream is now closed when reading finishes or fails; previously it
     * was left open.
     *
     * @param str the base location
     * @param str2 the suffix appended to the base location
     * @return word to (count, position) mapping in file order
     * @throws IOException if the file cannot be opened or read
     * @throws NumberFormatException if the count field of a two-field line is not a long
     */
    public static LinkedHashMap<String, IndependentPair<Long, Long>> readWordCountLines(String str, String str2) throws IOException {
        Path path = HadoopToolsUtil.getInputPaths(str + str2)[0];
        LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = new LinkedHashMap<>();
        // Closing the reader closes the wrapped file-system stream as well.
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(HadoopToolsUtil.getFileSystem(path).open(path), "UTF-8"))) {
            CSVParser parser = new CSVParser(reader);
            long position = 0;
            String[] line;
            while ((line = parser.getLine()) != null && line.length > 0) {
                if (line.length != 2) {
                    System.out.println("PROBLEM READLINE LINE: " + Arrays.toString(line));
                    continue;
                }
                wordIndex.put(line[0], IndependentPair.pair(Long.valueOf(Long.parseLong(line[1])), Long.valueOf(position)));
                position++;
            }
        }
        return wordIndex;
    }

    /**
     * Marks intermediate output for removal and queues two stages on the given job: first
     * the stage that collates words (output name {@code "words-collated"}), then the stage
     * that sorts them (output name {@code "words"}).
     *
     * @param multiStagedJob the job to which the stages are appended
     */
    public void stage(MultiStagedJob multiStagedJob) {
        multiStagedJob.removeIntermediate(true);
        SequenceFileStage<Text, BytesWritable, Text, LongWritable, LongWritable, Text> collateStage = createWordCollationStage();
        SequenceFileTextStage<LongWritable, Text, LongWritable, Text, NullWritable, Text> sortStage = createWordSortStage();
        multiStagedJob.queueStage(collateStage);
        multiStagedJob.queueStage(sortStage);
    }

    /** Builds the stage that runs {@link Map} and {@link Reduce} with a single reduce task. */
    private SequenceFileStage<Text, BytesWritable, Text, LongWritable, LongWritable, Text> createWordCollationStage() {
        return new SequenceFileStage<Text, BytesWritable, Text, LongWritable, LongWritable, Text>() {
            public void setup(Job job) {
                // Publish both thresholds so the mapper and reducer can read them in setup().
                job.getConfiguration().setInt(WordIndex.WORDCOUNT_THRESH, WordIndex.this.wordCountThreshold);
                job.getConfiguration().setInt(WordIndex.WORDCOUNT_TIMETHRESH, WordIndex.this.wordTimeThreshold);
                job.setNumReduceTasks(1);
            }

            public Class<? extends Mapper<Text, BytesWritable, Text, LongWritable>> mapper() {
                return WordIndex.Map.class;
            }

            public Class<? extends Reducer<Text, LongWritable, LongWritable, Text>> reducer() {
                return WordIndex.Reduce.class;
            }

            public String outname() {
                return "words-collated";
            }
        };
    }

    /** Builds the stage that sorts keys in decreasing order and reduces with {@code WordIndexSort.Reduce}. */
    private SequenceFileTextStage<LongWritable, Text, LongWritable, Text, NullWritable, Text> createWordSortStage() {
        return new SequenceFileTextStage<LongWritable, Text, LongWritable, Text, NullWritable, Text>() {
            public void setup(Job job) {
                job.getConfiguration().setInt(WordIndex.WORDCOUNT_TOPN, WordIndex.this.topNWords);
                job.setSortComparatorClass(LongWritable.DecreasingComparator.class);
                job.setNumReduceTasks(1);
            }

            public Class<? extends Reducer<LongWritable, Text, NullWritable, Text>> reducer() {
                return WordIndexSort.Reduce.class;
            }

            public String outname() {
                return "words";
            }
        };
    }

    /**
     * Small diagnostic entry point: loads a word index, prints its size, reports any
     * entries whose value is {@code null}, and prints the entry for {@code "!"}.
     *
     * @param strArr optional; when a first argument is given it is used as the location to
     *     read, otherwise the original hard-coded development path is used
     * @throws IOException if the word index cannot be read
     */
    public static void main(String[] strArr) throws IOException {
        String location = (strArr != null && strArr.length > 0)
                ? strArr[0]
                : "/Users/ss/Development/data/trendminer/sheffield/2010/09/tweets.2010-09-01.sparsecsv";
        LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = readWordCountLines(location);
        System.out.println("Number of words index: " + wordIndex.size());
        // java.util.Map must be fully qualified: inside this class the simple name "Map"
        // refers to the nested mapper class, which has no Entry type.
        for (java.util.Map.Entry<String, IndependentPair<Long, Long>> entry : wordIndex.entrySet()) {
            if (entry.getValue() == null) {
                System.out.println(entry.getKey() + " was null!");
            }
        }
        System.out.println(wordIndex.get("!"));
    }

    /**
     * Writes the word index read from {@code str} to {@code str + "/words/wordIndex.mat"}
     * as a cell array named {@code "words"} with one row per word: column 0 holds the word
     * and column 1 holds its count as a 1x1 double matrix. The row used for a word is the
     * position recorded for it in the index.
     *
     * @param str the base location of the word index
     * @throws IOException if the index cannot be read or the file cannot be written
     */
    public static void writeToMatlab(String str) throws IOException {
        Path path = new Path(str + "/words/wordIndex.mat");
        FileSystem fileSystem = HadoopToolsUtil.getFileSystem(path);
        LinkedHashMap<String, IndependentPair<Long, Long>> wordIndex = readWordCountLines(str);
        MLCell wordCell = new MLCell("words", new int[]{wordIndex.size(), 2});
        System.out.println("... reading words");
        // java.util.Map must be fully qualified: inside this class the simple name "Map"
        // refers to the nested mapper class, which has no Entry type.
        for (java.util.Map.Entry<String, IndependentPair<Long, Long>> entry : wordIndex.entrySet()) {
            String word = entry.getKey();
            int row = entry.getValue().secondObject().intValue();
            long count = entry.getValue().firstObject().longValue();
            wordCell.set(new MLChar((String) null, word), row, 0);
            // A real 1x1 double[][]; the decompiled double[]-of-double[] form does not compile.
            wordCell.set(new MLDouble((String) null, new double[][]{{count}}), row, 1);
        }
        ArrayList arrayList = new ArrayList();
        arrayList.add(wordCell);
        // NOTE(review): the channel is handed to MatFileWriter and never closed here, as in
        // the original - confirm that MatFileWriter closes it once writing completes.
        new MatFileWriter(Channels.newChannel(fileSystem.create(path)), arrayList);
    }
}
