package org.openimaj.text.nlp;

import gov.sandia.cognition.text.token.DefaultToken;
import gov.sandia.cognition.text.token.Token;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.lang.StringEscapeUtils;
import org.openimaj.text.nlp.patterns.AbbreviationPatternProvider;
import org.openimaj.text.nlp.patterns.ComplicatedNumberPatternProvider;
import org.openimaj.text.nlp.patterns.EmailPatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedApostrophePatternProvider;
import org.openimaj.text.nlp.patterns.EmbeddedDashPatternProvider;
import org.openimaj.text.nlp.patterns.EmoticonPatternProvider;
import org.openimaj.text.nlp.patterns.EntityPatternProvider;
import org.openimaj.text.nlp.patterns.PunctuationPatternProvider;
import org.openimaj.text.nlp.patterns.TimePatternProvider;
import org.openimaj.text.nlp.patterns.TruncatedURLPatternProvider;
import org.openimaj.text.nlp.patterns.TwitterStuffPatternProvider;
import org.openimaj.text.nlp.patterns.URLPatternProvider;
import org.openimaj.text.util.RegexUtil;

/* loaded from: input_file:org/openimaj/text/nlp/TweetTokeniser.class */
/**
 * Splits a piece of text into {@link Token}s.
 *
 * <p>Spans matched by {@link #Protect_RE} are emitted verbatim as single
 * ("protected") tokens; the text between those spans is split on whitespace
 * into "unprotected" tokens. The combined, in-order token list is available
 * through {@link #getTokens()} and {@link #iterator()}.
 *
 * <p>All tokenisation work happens in the constructor.
 */
public class TweetTokeniser implements Iterable<Token> {
    /** One or more whitespace characters. */
    private static final String spaceRegex = "\\s+";

    /**
     * Languages {@link #isValid(String)} rejects. "ja" is the ISO 639-1 code
     * for Japanese; the historical (incorrect) "jp" entry is retained so that
     * callers passing it keep getting the same answer.
     */
    private static final Locale[] invalidLanguages = {
        new Locale("zh"), new Locale("ko"), new Locale("ja"), new Locale("jp")
    };

    // Pattern providers. Declaration order matters: later initialisers
    // (abbrev, embedded, embeddedDash, ProtectThese) read earlier fields.
    static EmoticonPatternProvider emoticons = new EmoticonPatternProvider();
    static PunctuationPatternProvider punctuation = new PunctuationPatternProvider();
    static EntityPatternProvider entity = new EntityPatternProvider();
    static TruncatedURLPatternProvider truncatedURL = new TruncatedURLPatternProvider();
    static URLPatternProvider url = new URLPatternProvider();
    static TimePatternProvider time = new TimePatternProvider();
    static ComplicatedNumberPatternProvider number = new ComplicatedNumberPatternProvider();
    static TwitterStuffPatternProvider twitterPart = new TwitterStuffPatternProvider();
    static EmailPatternProvider email = new EmailPatternProvider();
    static AbbreviationPatternProvider abbrev = new AbbreviationPatternProvider(entity);
    static String Separators = RegexUtil.regex_or_match("--+", "―");
    static String Decorations = "[♫]+";
    static EmbeddedApostrophePatternProvider embedded = new EmbeddedApostrophePatternProvider(punctuation);
    // NOTE(review): embeddedDash is constructed but not part of ProtectThese.
    static EmbeddedDashPatternProvider embeddedDash = new EmbeddedDashPatternProvider(punctuation);

    /** The pattern strings that are combined into {@link #Protect_RE}. */
    static String[] ProtectThese = {
        twitterPart.patternString(),
        emoticons.patternString(),
        truncatedURL.patternString(),
        url.patternString(),
        email.patternString(),
        entity.patternString(),
        time.patternString(),
        number.patternString(),
        embedded.patternString(),
        punctuation.patternString(),
        abbrev.patternString(),
        Separators,
        Decorations
    };
    static String oredProtect = RegexUtil.regex_or_match(ProtectThese);

    /** Compiled form of {@link #oredProtect}; case-insensitive with Unicode case folding. */
    static Pattern Protect_RE = Pattern.compile(oredProtect, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);

    /** The text being tokenised, after encoding fix-up and whitespace squeezing. */
    private String text;
    /** Every token, in order of appearance. */
    private List<Token> tokenize;
    /** Only the tokens produced by a {@link #Protect_RE} match. */
    private List<Token> protectedTokens;
    /** Only the tokens produced by whitespace splitting. */
    private List<Token> unprotectedTokens;

    /**
     * Checks whether the language of the given locale is accepted.
     *
     * @param locale the locale whose language is checked; must not be null
     * @return {@code false} if the locale's language is one of the rejected
     *         languages, {@code true} otherwise
     */
    public static boolean isValid(Locale locale) {
        return isValid(locale.getLanguage());
    }

    /**
     * Checks whether the given language code is accepted.
     *
     * @param str a language code, compared case-sensitively against the
     *            language of each rejected locale
     * @return {@code false} if the code belongs to a rejected language,
     *         {@code true} otherwise (including for {@code null})
     */
    public static boolean isValid(String str) {
        for (Locale rejected : invalidLanguages) {
            if (rejected.getLanguage().equals(str)) {
                return false;
            }
        }
        return true;
    }

    /**
     * Tokenises the given text immediately.
     *
     * @param str the text to tokenise; must not be null
     * @throws UnsupportedEncodingException if the UTF-8 charset is unavailable
     * @throws TweetTokeniserException declared for callers; not raised by the
     *         code visible in this class
     * @throws NullPointerException if {@code str} is null
     */
    public TweetTokeniser(String str) throws UnsupportedEncodingException, TweetTokeniserException {
        if (str == null) {
            throw new NullPointerException("str");
        }
        this.text = str;
        fixEncoding();
        squeeze_whitespace();
        simple_tokenize();
    }

    /**
     * Walks the text once, emitting each {@link #Protect_RE} match as a single
     * token and whitespace-splitting whatever lies between matches.
     */
    private void simple_tokenize() throws TweetTokeniserException {
        edge_punct_munge();

        List<Token> all = new ArrayList<>();
        List<Token> unprotectedOnly = new ArrayList<>();
        List<Token> protectedOnly = new ArrayList<>();

        int cursor = 0;
        Matcher matcher = Protect_RE.matcher(this.text);
        while (matcher.find()) {
            // Text between the previous match and this one.
            List<Token> gapTokens = unprotected_tokenize(this.text.substring(cursor, matcher.start()));
            all.addAll(gapTokens);
            unprotectedOnly.addAll(gapTokens);

            // The match itself is kept whole.
            Token protectedToken = new DefaultToken(matcher.group(), 0);
            all.add(protectedToken);
            protectedOnly.add(protectedToken);

            cursor = matcher.end();
        }

        // Whatever follows the final match (or the entire text if nothing matched).
        List<Token> tailTokens = unprotected_tokenize(this.text.substring(cursor));
        all.addAll(tailTokens);
        unprotectedOnly.addAll(tailTokens);

        this.tokenize = post_process(all);
        this.protectedTokens = post_process(protectedOnly);
        this.unprotectedTokens = post_process(unprotectedOnly);
    }

    /** Post-processing hook; currently returns its argument unchanged. */
    private List<Token> post_process(List<Token> tokens) {
        return tokens;
    }

    /**
     * Splits a fragment on whitespace and wraps each non-empty piece in a token.
     *
     * @param str the fragment to split
     * @return the tokens, in order; empty if the fragment holds only whitespace
     */
    private List<Token> unprotected_tokenize(String str) {
        List<Token> tokens = new ArrayList<>();
        for (String piece : str.split(spaceRegex)) {
            if (!piece.isEmpty()) {
                tokens.add(new DefaultToken(piece, 0));
            }
        }
        return tokens;
    }

    /** Pre-tokenisation hook; currently does nothing. */
    private void edge_punct_munge() {
    }

    /** Replaces every run of whitespace in the text with a single space. */
    private void squeeze_whitespace() {
        this.text = this.text.replaceAll(spaceRegex, " ");
    }

    /**
     * Round-trips the text through UTF-8 bytes, then unescapes HTML entities.
     */
    private void fixEncoding() throws UnsupportedEncodingException {
        this.text = new String(this.text.getBytes("UTF-8"), "UTF-8");
        this.text = StringEscapeUtils.unescapeHtml(this.text);
    }

    /**
     * @return an iterator over every token, in order of appearance
     */
    @Override // java.lang.Iterable
    public Iterator<Token> iterator() {
        return this.tokenize.iterator();
    }

    /**
     * @return every token, in order of appearance (the internal list, not a copy)
     */
    public List<Token> getTokens() {
        return this.tokenize;
    }

    /**
     * @return the text of every token, in order of appearance
     */
    public List<String> getStringTokens() {
        return textsOf(this.tokenize);
    }

    /**
     * @return the text of each token produced by a {@link #Protect_RE} match
     */
    public List<String> getProtectedStringTokens() {
        return textsOf(this.protectedTokens);
    }

    /**
     * @return the text of each token produced by whitespace splitting
     */
    public List<String> getUnprotectedStringTokens() {
        return textsOf(this.unprotectedTokens);
    }

    /** Collects {@code getText()} of each token into a new list. */
    private static List<String> textsOf(List<Token> tokens) {
        List<String> texts = new ArrayList<>(tokens.size());
        for (Token token : tokens) {
            texts.add(token.getText());
        }
        return texts;
    }
}
