package org.openimaj.web.readability;

import java.io.IOException;
import java.io.StringReader;
import java.net.URL;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.EnumSet;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.cyberneko.html.parsers.DOMFragmentParser;
import org.cyberneko.html.parsers.DOMParser;
import org.pojava.datetime.DateTime;
import org.w3c.dom.DOMException;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.dom.bootstrap.DOMImplementationRegistry;
import org.w3c.dom.ls.DOMImplementationLS;
import org.w3c.dom.ls.LSSerializer;
import org.w3c.dom.traversal.DocumentTraversal;
import org.w3c.dom.traversal.NodeFilter;
import org.w3c.dom.traversal.TreeWalker;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:org/openimaj/web/readability/Readability.class */
public class Readability {
    /**
     * Link-density cut-off used by {@link #cleanHeaders(Element)}: a header whose
     * link density exceeds this value is removed. Mutable and shared by all instances.
     */
    public static float LINK_DENSITY_THRESHOLD = 0.33f;
    /** The DOM document being processed; it is modified in place. */
    protected Document document;
    /** Deep clone of the original body taken in {@link #init()} so a retry can restore it. */
    private Node bodyCache;
    /** Heuristics still enabled; {@link #init()} removes flags one at a time when too little text is found. */
    protected EnumSet<Flag> flags;
    /** Title chosen by {@link #findArticleTitle()}. */
    protected String articleTitle;
    /** Extracted article element, or {@code null} when no usable content was found. */
    protected Element articleContent;
    /** Raw date text located by {@link #findArticleDate()}. */
    protected String article_date_string;
    /** Parsed form of {@link #article_date_string}; stays {@code null} when parsing failed. */
    protected Date article_date;
    /** Value of the {@code content} attribute of the Content-Type meta tag, if any. */
    protected String article_contentType;
    /** When {@code true}, {@link #dbg(String)} writes diagnostics to standard error. */
    protected boolean debug;
    /** When {@code true}, {@link #init()} prepends an {@code h1} holding the title to the article. */
    protected boolean addTitle;

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:org/openimaj/web/readability/Readability$Flag.class */
    /** Switchable extraction heuristics; all are enabled when a {@code Readability} is constructed. */
    public enum Flag {
        /** Remove elements whose class/id looks like non-content before scoring (see grabArticle). */
        FLAG_STRIP_UNLIKELYS,
        /** Let class and id names contribute to an element's score (see getClassWeight). */
        FLAG_WEIGHT_CLASSES
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/openimaj/web/readability/Readability$MappingNode.class */
    /** Simple pairing of an element id with a piece of text. */
    public class MappingNode {
        String id;
        String text;

        /**
         * @param str  the id
         * @param str2 the text associated with that id
         */
        public MappingNode(String str, String str2) {
            id = str;
            text = str2;
        }

        /** @return the id given at construction */
        public String getId() {
            return id;
        }

        /** @return the text given at construction */
        public String getText() {
            return text;
        }

        /** @return {@code MappingNode(<id> -> <text>)} */
        @Override
        public String toString() {
            StringBuilder out = new StringBuilder("MappingNode(");
            out.append(id);
            out.append(" -> ");
            out.append(text);
            out.append(")");
            return out.toString();
        }
    }

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:org/openimaj/web/readability/Readability$Regexps.class */
    /**
     * Regular-expression sources used by the extraction heuristics. The fields are
     * public, static and non-final, so they can be reassigned at runtime.
     */
    public static class Regexps {
        /** class/id fragments that mark an element as an unlikely article candidate (used in grabArticle). */
        public static String unlikelyCandidatesRe = "(?i)combx|comment|disqus|foot|header|menu|rss|shoutbox|sidebar|sponsor|story-feature|banner";
        /** class/id fragments that rescue an element matched by {@link #unlikelyCandidatesRe}. */
        public static String okMaybeItsACandidateRe = "(?i)and|comments|article|body|column|main";
        /** class/id fragments that raise an element's weight in getClassWeight. */
        public static String positiveRe = "(?i)article|body|comments|content|entry|hentry|page|pagination|post|text";
        /** class/id fragments that lower an element's weight in getClassWeight. */
        public static String negativeRe = "(?i)combx|comment|contact|foot|footer|footnote|link|masthead|media|meta|promo|related|scroll|shoutbox|sponsor|tags|widget|warning";
        /** Searched in a DIV's inner HTML by grabArticle; a DIV with no match is turned into a P. */
        public static String divToPElementsRe = "(?i)(a|blockquote|dl|div|img|ol|p|pre|table|ul)";
        /** Two or more consecutive BR tags; prepDocument replaces them with a paragraph break. */
        public static String replaceBrsRe = "(?i)(<br[^>]*>[ \n\r\t]*){2,}";
        /** Opening/closing FONT tags; prepDocument rewrites them to SPAN. */
        public static String replaceFontsRe = "(?i)<(\\/?)font[^>]*>";
        /** Leading or trailing whitespace. */
        public static String trimRe = "^\\s+|\\s+$";
        /** Runs of two or more whitespace characters. */
        public static String normalizeRe = "\\s{2,}";
        /** One or more BR tags with trailing whitespace/nbsp; killBreaks collapses each run to one BR. */
        public static String killBreaksRe = "(<br\\s*\\/?>(\\s|&nbsp;?)*){1,}";
        /** YouTube/Vimeo URLs, used when deciding whether an embed is a video. */
        public static String videoRe = "(?i)http:\\/\\/(www\\.)?(youtube|vimeo)\\.com";
        /** Body of a character class (| - /) spliced into the title-splitting patterns in findArticleTitle. */
        public static String titleSeparatorRe = "\\|\\-\\/";
        /** Marker for likely sub-headings; not referenced in the visible part of this file. */
        public static String likelySubheadCandidateRe = "(?i)cross-head";

        /** Not meant to be instantiated by clients; subclasses may extend. */
        protected Regexps() {
        }
    }

    /**
     * Processes {@code document} with debugging output and title insertion both disabled.
     *
     * @param document the DOM document to analyse; it is modified in place
     */
    public Readability(Document document) {
        this(document, false);
    }

    /**
     * Processes {@code document} without inserting the title into the article.
     *
     * @param document the DOM document to analyse; it is modified in place
     * @param z        whether to print debugging output to standard error
     */
    public Readability(Document document, boolean z) {
        this(document, z, false);
    }

    /**
     * Stores the settings, gives every id-less element a generated id, then runs the
     * full extraction via {@link #init()}.
     *
     * @param document the DOM document to analyse; it is modified in place
     * @param z        whether to print debugging output to standard error
     * @param z2       whether to prepend the article title as an {@code h1}
     */
    public Readability(Document document, boolean z, boolean z2) {
        // Start with every heuristic enabled; init() drops them one by one if needed.
        this.flags = EnumSet.allOf(Flag.class);
        this.document = document;
        this.debug = z;
        this.addTitle = z2;

        augmentDocument(document);
        init();
    }

    /**
     * Assigns a generated {@code gen-id-N} id to every element of the document that
     * has no (or an empty) {@code id} attribute.
     *
     * @param document the document to augment; must also implement {@link DocumentTraversal}
     */
    public static void augmentDocument(Document document) {
        DocumentTraversal traversal = (DocumentTraversal) document;
        // Walk elements only, expanding entity references.
        TreeWalker elementWalker = traversal.createTreeWalker(document, NodeFilter.SHOW_ELEMENT, null, true);
        traverseLevel(elementWalker, 0);
    }

    /**
     * Depth-first visit of the walker's current node and its descendants, giving each
     * element without an id the id {@code gen-id-<counter>}.
     *
     * @param treeWalker walker positioned on the node to process; its position is restored on return
     * @param i          next counter value to use
     * @return the next unused counter value
     */
    private static int traverseLevel(TreeWalker treeWalker, int i) {
        final Node here = treeWalker.getCurrentNode();
        int nextId = i;

        if (here instanceof Element) {
            Element asElement = (Element) here;
            if (asElement.getAttribute("id").length() == 0) {
                asElement.setAttribute("id", "gen-id-" + nextId);
                nextId++;
            }
        }

        for (Node child = treeWalker.firstChild(); child != null; child = treeWalker.nextSibling()) {
            nextId = traverseLevel(treeWalker, nextId);
        }

        // Step back up so the caller's nextSibling() continues from this node.
        treeWalker.setCurrentNode(here);
        return nextId;
    }

    /**
     * Writes a diagnostic line to standard error when debugging is enabled.
     *
     * @param str the message to print
     */
    protected void dbg(String str) {
        if (!this.debug) {
            return;
        }
        System.err.println(str);
    }

    /**
     * @return the text of the first {@code title} element, or the empty string when
     *         the document has none
     */
    protected String getTitle() {
        NodeList titles = this.document.getElementsByTagName("title");
        if (titles.getLength() == 0) {
            return "";
        }
        return titles.item(0).getTextContent();
    }

    /**
     * Collects every match of a regular expression in a string.
     *
     * @param str  the text to scan
     * @param str2 the regular expression source
     * @return the full text of each successive match, in order; empty when nothing matches
     */
    protected String[] match(String str, String str2) {
        Matcher matcher = Pattern.compile(str2).matcher(str);
        List<String> hits = new ArrayList<String>();
        while (matcher.find()) {
            hits.add(matcher.group(0));
        }
        return hits.toArray(new String[0]);
    }

    /** @return {@code true} when an article element was extracted */
    public boolean hasContent() {
        final boolean extracted = (this.articleContent != null);
        return extracted;
    }

    /**
     * Finds the first match of a regular expression in a string.
     *
     * @param str  the text to scan
     * @param str2 the regular expression source
     * @return the start index of the first match, or -1 when there is none
     */
    protected int search(String str, String str2) {
        Matcher m = Pattern.compile(str2).matcher(str);
        return m.find() ? m.start() : -1;
    }

    /**
     * Stores the {@code content} attribute of the first
     * {@code <meta http-equiv="Content-Type">} element in {@code article_contentType}.
     * The {@code http-equiv} value is matched case-insensitively, since pages commonly
     * write it as {@code content-type}. Nothing is changed when no such tag exists.
     */
    protected void findArticleEncoding() {
        NodeList metas = this.document.getElementsByTagName("meta");
        for (int i = 0; i < metas.getLength(); i++) {
            Element meta = (Element) metas.item(i);
            if ("Content-Type".equalsIgnoreCase(meta.getAttribute("http-equiv"))) {
                this.article_contentType = meta.getAttribute("content");
                return;
            }
        }
    }

    /**
     * Tries to locate the publication date of the article, stopping at the first source
     * that yields a candidate:
     * <ol>
     * <li>{@code <meta name="OriginalPublicationDate">} or {@code <meta name="DC.date.issued">};</li>
     * <li>a {@code <time>} element carrying a {@code pubdate} attribute;</li>
     * <li>any element whose {@code class}, then any whose {@code id}, mentions "date"
     * but not "update" (its text is handed to {@link #parseDate()});</li>
     * <li>any element whose text contains "Last updated at h:mm AM|PM on Nth Month yyyy".</li>
     * </ol>
     * The raw text is stored in {@code article_date_string}. {@code article_date} is set
     * when parsing succeeds: a failed parse in the first two sources sets it to
     * {@code null}, while in the remaining sources it is left unchanged.
     */
    protected void findArticleDate() {
        NodeList metas = this.document.getElementsByTagName("meta");
        for (int i = 0; i < metas.getLength(); i++) {
            Element meta = (Element) metas.item(i);
            String metaName = meta.getAttribute("name");
            if (metaName.equals("OriginalPublicationDate") || metaName.equals("DC.date.issued")) {
                this.article_date_string = meta.getAttribute("content");
                this.article_date = tryParseDateTime(this.article_date_string);
                return;
            }
        }

        NodeList times = this.document.getElementsByTagName("time");
        for (int i = 0; i < times.getLength(); i++) {
            Element time = (Element) times.item(i);
            if (time.getAttributeNode("pubdate") != null) {
                this.article_date_string = time.getAttribute("datetime");
                this.article_date = tryParseDateTime(this.article_date_string);
                return;
            }
        }

        NodeList all = this.document.getElementsByTagName("*");
        // Class names are checked across the whole document before ids are considered.
        for (int i = 0; i < all.getLength(); i++) {
            Element candidate = (Element) all.item(i);
            if (mentionsDateButNotUpdate(candidate.getAttribute("class"))) {
                this.article_date_string = getInnerTextSep(candidate).trim();
                parseDate();
                return;
            }
        }
        for (int i = 0; i < all.getLength(); i++) {
            Element candidate = (Element) all.item(i);
            if (mentionsDateButNotUpdate(candidate.getAttribute("id"))) {
                this.article_date_string = getInnerTextSep(candidate).trim();
                parseDate();
                return;
            }
        }

        // Compiled once instead of once per element.
        final Pattern lastUpdated = Pattern.compile("Last updated at (\\d+:\\d\\d [AP]M on \\d+[thsndr]+ \\w+ \\d\\d\\d\\d)");
        for (int i = 0; i < all.getLength(); i++) {
            String text = all.item(i).getTextContent();
            if (text == null) {
                continue;
            }
            Matcher matcher = lastUpdated.matcher(text);
            if (!matcher.find()) {
                continue;
            }
            this.article_date_string = matcher.group(1);
            // Strip the ordinal suffix only where it follows a digit; removing "st"
            // everywhere would also mangle month names such as "August".
            String withoutOrdinal = this.article_date_string.replaceAll("(\\d)[thsndr]+", "$1");
            try {
                this.article_date = new SimpleDateFormat("h:mm a 'on' dd MMMM yyyy").parse(withoutOrdinal);
            } catch (ParseException ignored) {
                // Best effort: keep the raw string, leave article_date untouched.
            }
            return;
        }
    }

    /** Returns true when {@code value} contains "date"/"Date" but not "update"/"Update". */
    private static boolean mentionsDateButNotUpdate(String value) {
        boolean mentionsDate = value.contains("date") || value.contains("Date");
        boolean mentionsUpdate = value.contains("update") || value.contains("Update");
        return mentionsDate && !mentionsUpdate;
    }

    /**
     * Parses free-form date text with {@code DateTime.parse}, returning null instead of
     * propagating the runtime exceptions that {@link #parseDate()} also guards against.
     */
    private Date tryParseDateTime(String text) {
        try {
            return DateTime.parse(text).toDate();
        } catch (ArrayIndexOutOfBoundsException e) {
            dbg("Unparseable article date: " + text);
        } catch (IllegalArgumentException e) {
            dbg("Unparseable article date: " + text);
        }
        return null;
    }

    /**
     * Converts {@code article_date_string} into {@code article_date}. Does nothing when
     * the string is null or blank. A string containing "Today" is parsed as
     * {@code 'Today @' HH:mm z} and then stamped with the current day, month and year.
     * Otherwise a fixed list of formats is tried in order, falling back to
     * {@code DateTime.parse}. When nothing matches, {@code article_date} is left
     * unchanged and a message is sent to {@link #dbg(String)} rather than printed to
     * standard output.
     */
    @SuppressWarnings("deprecation")
    protected void parseDate() {
        if (this.article_date_string == null || this.article_date_string.trim().isEmpty()) {
            return;
        }

        if (this.article_date_string.contains("Today")) {
            try {
                this.article_date = new SimpleDateFormat("'Today @' HH:mm z").parse(this.article_date_string);
                // The pattern carries no calendar date, so borrow today's.
                Date now = new Date();
                this.article_date.setDate(now.getDate());
                this.article_date.setMonth(now.getMonth());
                this.article_date.setYear(now.getYear());
            } catch (ParseException ignored) {
                // Best effort: leave article_date as it was.
            }
            return;
        }

        final String[] knownFormats = {
            "h:mm z',' E',' dd M yyyy",
            "dd.MM.yyyy '@' HH:mm z",
            "dd/MM/yyyy"
        };
        for (String format : knownFormats) {
            try {
                this.article_date = new SimpleDateFormat(format).parse(this.article_date_string);
                return;
            } catch (ParseException ignored) {
                // Try the next format.
            }
        }

        try {
            this.article_date = DateTime.parse(this.article_date_string).toDate();
        } catch (ArrayIndexOutOfBoundsException e) {
            dbg("Unparseable article date: " + this.article_date_string);
        } catch (IllegalArgumentException e) {
            dbg("Unparseable article date: " + this.article_date_string);
        }
    }

    /**
     * Derives the article title from the document's {@code title} and headings.
     * <p>
     * If the text of any {@code h1}–{@code h6} occurs inside the page title, the longest
     * such heading is returned. Otherwise the page title is split on a separator
     * ({@code |}, {@code -}, {@code /} surrounded by spaces, or a colon), or replaced by
     * the sole {@code h1} when it is very short or very long. A result of three words or
     * fewer is discarded in favour of the full page title.
     *
     * @return the chosen title
     */
    protected String findArticleTitle() {
        final String pageTitle = getTitle();
        String candidate = pageTitle;

        List<String> headings = new ArrayList<String>();
        for (int level = 1; level <= 6; level++) {
            NodeList found = this.document.getElementsByTagName("h" + level);
            for (int j = 0; j < found.getLength(); j++) {
                headings.add(found.item(j).getTextContent().trim());
            }
        }

        String longestHeading = null;
        int longestLength = 0;
        for (String heading : headings) {
            if (heading.length() > longestLength && candidate.contains(heading)) {
                longestHeading = heading;
                longestLength = heading.length();
            }
        }
        if (longestHeading != null) {
            return longestHeading;
        }

        final String sep = Regexps.titleSeparatorRe;
        if (match(candidate, " [" + sep + "]+ ").length > 0) {
            // Keep what precedes the last separator; if that is too short, keep what follows the first.
            candidate = pageTitle.replaceAll("(.*) [" + sep + "]+ .*", "$1");
            if (candidate.split(" ").length < 3) {
                candidate = pageTitle.replaceAll("(?i)[^" + sep + "]*[" + sep + "]+(.*)", "$1");
            }
        } else if (candidate.contains(": ")) {
            candidate = pageTitle.replaceAll("(?i).*:(.*)", "$1");
            if (candidate.split(" ").length < 3) {
                candidate = pageTitle.replaceAll("(?i)[^:]*[:](.*)", "$1");
            }
        } else if (candidate.length() > 150 || candidate.length() < 15) {
            NodeList h1s = this.document.getElementsByTagName("h1");
            if (h1s.getLength() == 1) {
                candidate = getInnerText((Element) h1s.item(0));
            }
        }

        String trimmed = candidate.replaceAll(Regexps.trimRe, "");
        return trimmed.split(" ").length <= 3 ? pageTitle : trimmed;
    }

    /** @return the first {@code body} element of the document, or {@code null} when there is none */
    protected Element getBody() {
        NodeList bodies = this.document.getElementsByTagName("body");
        return bodies.getLength() > 0 ? (Element) bodies.item(0) : null;
    }

    /**
     * Runs the extraction pipeline: metadata lookup, document preparation, title
     * detection and article grabbing.
     * <p>
     * If the extracted text is shorter than 250 characters, the original body is restored
     * and the pipeline is re-run with one fewer heuristic: first without
     * {@link Flag#FLAG_STRIP_UNLIKELYS}, then without {@link Flag#FLAG_WEIGHT_CLASSES}.
     * When both are exhausted {@code articleContent} is set to {@code null}.
     * <p>
     * Each retry restores a fresh deep copy of the cached body. Inserting the cache
     * itself would let the next pass mutate it, so a second retry would start from an
     * already-processed body.
     */
    protected void init() {
        if (this.bodyCache == null) {
            Element originalBody = getBody();
            if (originalBody != null) {
                this.bodyCache = originalBody.cloneNode(true);
            }
        }

        findArticleDate();
        findArticleEncoding();
        prepDocument();
        this.articleTitle = findArticleTitle();
        this.articleContent = grabArticle();

        if (getInnerText(this.articleContent, false).length() < 250) {
            Flag toDisable = null;
            if (this.flags.contains(Flag.FLAG_STRIP_UNLIKELYS)) {
                toDisable = Flag.FLAG_STRIP_UNLIKELYS;
            } else if (this.flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
                toDisable = Flag.FLAG_WEIGHT_CLASSES;
            }

            if (toDisable != null) {
                this.flags.remove(toDisable);
                Element currentBody = getBody();
                // No cache exists when the document had no body to begin with.
                if (this.bodyCache != null && currentBody != null) {
                    currentBody.getParentNode().replaceChild(this.bodyCache.cloneNode(true), currentBody);
                }
                init();
                return;
            }
            this.articleContent = null;
        }

        if (!this.addTitle || this.articleContent == null) {
            return;
        }
        Element heading = this.document.createElement("h1");
        heading.setAttribute("id", "title");
        heading.appendChild(this.document.createTextNode(getArticleTitle()));
        this.articleContent.insertBefore(heading, this.articleContent.getFirstChild());
    }

    /**
     * Normalises the document before scoring: guarantees a {@code body}, removes every
     * {@code script}, {@code style} and {@code meta} element, turns runs of {@code <br>}
     * into paragraph breaks, rewrites {@code font} tags as {@code span}, and strips
     * comments.
     */
    protected void prepDocument() {
        if (getBody() == null) {
            Element newBody = this.document.createElement("body");
            Element root = this.document.getDocumentElement();
            // A Document may hold only one element child, so attach under the root when there is one.
            if (root != null) {
                root.appendChild(newBody);
            } else {
                this.document.appendChild(newBody);
            }
        }

        // NodeLists are live: walk backwards so that removals do not shift the
        // elements that are still to be visited.
        for (String tag : new String[] {"script", "style", "meta"}) {
            NodeList doomed = this.document.getElementsByTagName(tag);
            for (int idx = doomed.getLength() - 1; idx >= 0; idx--) {
                Node node = doomed.item(idx);
                node.getParentNode().removeChild(node);
            }
        }

        Element body = getBody();
        String html = getInnerHTML(body)
                .replaceAll(Regexps.replaceBrsRe, "</P><P>")
                .replaceAll(Regexps.replaceFontsRe, "<$1span>");
        Node rebuilt = stringToNode(html);
        removeChildren(body);
        body.appendChild(rebuilt);
        removeComments(this.document);
    }

    /**
     * Recursively removes every comment node at or below {@code node}.
     *
     * @param node the subtree root; if it is itself a comment it is detached from its parent
     */
    protected void removeComments(Node node) {
        if (node.getNodeType() == Node.COMMENT_NODE) {
            node.getParentNode().removeChild(node);
            return;
        }
        NodeList children = node.getChildNodes();
        // The child list is live: iterate from the end so that removing a comment
        // does not cause the following sibling to be skipped.
        for (int idx = children.getLength() - 1; idx >= 0; idx--) {
            removeComments(children.item(idx));
        }
    }

    /**
     * Cleans up the extracted article for display: strips inline styles, collapses
     * breaks, removes forms, objects, {@code h1}s, a lone {@code h2}, iframes, poorly
     * scoring headers, tables, lists and divs, and finally tidies the paragraphs.
     *
     * @param element the article content element; it is modified in place
     */
    protected void prepArticle(Element element) {
        cleanStyles(element);
        killBreaks(element);
        clean(element, "form");
        clean(element, "object");
        clean(element, "h1");
        // A single h2 is assumed to repeat the title, so it is dropped as well.
        if (element.getElementsByTagName("h2").getLength() == 1) {
            clean(element, "h2");
        }
        clean(element, "iframe");
        cleanHeaders(element);
        cleanConditionally(element, "table");
        cleanConditionally(element, "ul");
        cleanConditionally(element, "div");

        // Drop paragraphs that carry neither media nor text.
        NodeList paragraphs = element.getElementsByTagName("p");
        for (int idx = paragraphs.getLength() - 1; idx >= 0; idx--) {
            Element paragraph = (Element) paragraphs.item(idx);
            int imgCount = paragraph.getElementsByTagName("img").getLength();
            int embedCount = paragraph.getElementsByTagName("embed").getLength();
            int objectCount = paragraph.getElementsByTagName("object").getLength();
            // Compare by value: a reference comparison against "" is almost never true.
            if (imgCount == 0 && embedCount == 0 && objectCount == 0 && "".equals(getInnerText(paragraph, false))) {
                paragraph.getParentNode().removeChild(paragraph);
            }
        }

        // Remove a <br> that directly precedes a paragraph.
        Node rebuilt = stringToNode(getInnerHTML(element).replaceAll("(?i)<br[^>]*>\\s*<p", "<P"));
        removeChildren(element);
        element.appendChild(rebuilt);

        NodeList rebuiltParagraphs = element.getElementsByTagName("p");
        for (int idx = rebuiltParagraphs.getLength() - 1; idx >= 0; idx--) {
            Node paragraph = rebuiltParagraphs.item(idx);
            if (paragraph.getTextContent().trim().length() == 0) {
                paragraph.getParentNode().removeChild(paragraph);
            } else if (paragraph.getChildNodes().getLength() == 1
                    && paragraph.getChildNodes().item(0).getNodeType() == Node.TEXT_NODE) {
                paragraph.setTextContent("\n" + paragraph.getTextContent().trim() + "\n");
            } else if (((Element) paragraph).getAttribute("class").equals("readability-styled")) {
                // Unwrap the synthetic paragraphs created around bare text nodes.
                paragraph.getParentNode().replaceChild(this.document.createTextNode(paragraph.getTextContent()), paragraph);
            }
        }
    }

    /**
     * Detaches every child of {@code node}, leaving it empty.
     *
     * @param node the node to empty
     */
    protected void removeChildren(Node node) {
        while (node.hasChildNodes()) {
            node.removeChild(node.getFirstChild());
        }
    }

    /**
     * Gives an element its starting score, stored as a {@link Float} under the user-data
     * key {@code "readability"}. The score is a tag-based bonus or penalty plus
     * {@link #getClassWeight(Element)}.
     * <p>
     * Tag names are compared by value against their upper-case forms; a reference
     * comparison only succeeds for interned strings.
     *
     * @param element the element to initialise
     */
    protected void initializeNode(Element element) {
        final String tag = element.getTagName();
        float base = 0.0f;
        if (tag.equals("DIV")) {
            base = 5.0f;
        } else if (tag.equals("PRE") || tag.equals("TD") || tag.equals("BLOCKQUOTE")) {
            base = 3.0f;
        } else if (tag.equals("ADDRESS") || tag.equals("OL") || tag.equals("UL") || tag.equals("DL")
                || tag.equals("DD") || tag.equals("DT") || tag.equals("LI") || tag.equals("FORM")) {
            base = -3.0f;
        } else if (tag.equals("H1") || tag.equals("H2") || tag.equals("H3") || tag.equals("H4")
                || tag.equals("H5") || tag.equals("H6") || tag.equals("TH")) {
            base = -5.0f;
        }
        element.setUserData("readability", Float.valueOf(base + getClassWeight(element)), null);
    }

    /**
     * Scores an element by its {@code class} and {@code id} attributes: each attribute
     * contributes -25 when it matches {@code Regexps.negativeRe} and +25 when it matches
     * {@code Regexps.positiveRe}.
     *
     * @param element the element to inspect
     * @return the combined weight, or 0 when {@link Flag#FLAG_WEIGHT_CLASSES} is disabled
     */
    protected int getClassWeight(Element element) {
        if (!this.flags.contains(Flag.FLAG_WEIGHT_CLASSES)) {
            return 0;
        }
        int weight = 0;
        // Emptiness is tested by value rather than by reference comparison with "".
        String classAttr = element.getAttribute("class");
        if (classAttr != null && !classAttr.isEmpty()) {
            if (search(classAttr, Regexps.negativeRe) != -1) {
                weight -= 25;
            }
            if (search(classAttr, Regexps.positiveRe) != -1) {
                weight += 25;
            }
        }
        String idAttr = element.getAttribute("id");
        if (idAttr != null && !idAttr.isEmpty()) {
            if (search(idAttr, Regexps.negativeRe) != -1) {
                weight -= 25;
            }
            if (search(idAttr, Regexps.positiveRe) != -1) {
                weight += 25;
            }
        }
        return weight;
    }

    /**
     * Strips inline {@code style} attributes from the whole document, starting at its
     * root element. A {@link Document} is not an {@link Element}, so casting it would
     * always fail with a {@link ClassCastException}. A document without a root element
     * is a no-op because {@link #cleanStyles(Element)} ignores {@code null}.
     */
    protected void cleanStyles() {
        cleanStyles(this.document.getDocumentElement());
    }

    /**
     * Removes the {@code style} attribute from {@code element} and all of its descendant
     * elements, except those whose {@code class} is exactly {@code readability-styled}.
     *
     * @param element the subtree root; {@code null} is ignored
     */
    protected void cleanStyles(Element element) {
        if (element == null) {
            return;
        }
        if (!"readability-styled".equals(element.getAttribute("class"))) {
            element.removeAttribute("style");
        }
        Node child = element.getFirstChild();
        while (child != null) {
            if (child.getNodeType() == Node.ELEMENT_NODE) {
                // The recursive call applies the same class check to the child itself.
                cleanStyles((Element) child);
            }
            child = child.getNextSibling();
        }
    }

    /**
     * Collapses every run of {@code <br>} tags (with trailing whitespace or
     * {@code &nbsp;}) inside {@code element} into a single {@code <BR />} by re-parsing
     * the element's inner HTML.
     *
     * @param element the element whose children are rebuilt
     */
    protected void killBreaks(Element element) {
        String originalHtml = getInnerHTML(element);
        String collapsedHtml = originalHtml.replaceAll(Regexps.killBreaksRe, "<BR />");
        Node rebuilt = stringToNode(collapsedHtml);
        removeChildren(element);
        element.appendChild(rebuilt);
    }

    /**
     * Removes every descendant of {@code element} with the given tag name.
     * <p>
     * For {@code object} and {@code embed} tags, an element is kept when any attribute
     * value or its inner HTML matches {@code Regexps.videoRe} (YouTube/Vimeo). These
     * checks used to be computed and then ignored, so every embed was removed.
     *
     * @param element the subtree to clean
     * @param str     the tag name to remove
     */
    protected void clean(Element element, String str) {
        NodeList targets = element.getElementsByTagName(str);
        boolean isEmbed = str.equals("object") || str.equals("embed");

        // Backwards, because the list is live and shrinks as nodes are removed.
        for (int idx = targets.getLength() - 1; idx >= 0; idx--) {
            Node target = targets.item(idx);
            if (isEmbed) {
                StringBuilder attributeValues = new StringBuilder();
                int attributeCount = target.getAttributes().getLength();
                for (int a = 0; a < attributeCount; a++) {
                    attributeValues.append(target.getAttributes().item(a).getNodeValue()).append("|");
                }
                if (search(attributeValues.toString(), Regexps.videoRe) != -1) {
                    continue;
                }
                if (search(getInnerHTML(target), Regexps.videoRe) != -1) {
                    continue;
                }
            }
            target.getParentNode().removeChild(target);
        }
    }

    /**
     * Removes {@code h1}–{@code h6} descendants that have a negative class weight or a
     * link density above {@link #LINK_DENSITY_THRESHOLD}.
     *
     * @param element the subtree to clean
     */
    protected void cleanHeaders(Element element) {
        for (int level = 1; level <= 6; level++) {
            NodeList headers = element.getElementsByTagName("h" + level);
            int idx = headers.getLength() - 1;
            while (idx >= 0) {
                Element header = (Element) headers.item(idx);
                if (getClassWeight(header) < 0 || getLinkDensity(header) > LINK_DENSITY_THRESHOLD) {
                    header.getParentNode().removeChild(header);
                }
                idx--;
            }
        }
    }

    /**
     * Computes the fraction of an element's text that sits inside {@code a} elements.
     * <p>
     * The ratio is computed in floating point; integer division would truncate every
     * density below 1 to 0 and make the thresholds it is compared against meaningless.
     *
     * @param element the element to measure
     * @return link text length divided by total text length, or 0 when either is 0
     */
    protected float getLinkDensity(Element element) {
        NodeList links = element.getElementsByTagName("a");
        int textLength = getInnerText(element).length();
        int linkLength = 0;
        int linkCount = links.getLength();
        for (int idx = 0; idx < linkCount; idx++) {
            linkLength += getInnerText((Element) links.item(idx)).length();
        }
        if (linkLength == 0 || textLength == 0) {
            return 0.0f;
        }
        return (float) linkLength / (float) textLength;
    }

    /**
     * Removes descendants with the given tag name that look like non-content.
     * <p>
     * An element is removed outright when its class weight plus stored score is negative.
     * Otherwise, if its text has fewer than 10 commas, it is removed when any of a set
     * of heuristics fires (more images than paragraphs, many inputs, very little text,
     * high link density, non-video embeds). A block holding only a single image that is
     * not zero-sized is always kept.
     *
     * @param element the subtree to clean
     * @param str     the tag name to examine
     */
    protected void cleanConditionally(Element element, String str) {
        NodeList tags = element.getElementsByTagName(str);
        // Backwards, because the list is live and shrinks as nodes are removed.
        for (int idx = tags.getLength() - 1; idx >= 0; idx--) {
            Element tag = (Element) tags.item(idx);
            String label = tag + " (" + tag.getAttribute("class") + ":" + tag.getAttribute("id") + ")";

            int weight = getClassWeight(tag);
            Object storedScore = tag.getUserData("readability");
            float contentScore = storedScore != null ? ((Float) storedScore).floatValue() : 0.0f;
            dbg("Cleaning Conditionally " + label + (storedScore != null ? " with score " + storedScore : ""));

            if (weight + contentScore < 0.0f) {
                dbg("Removing " + label);
                tag.getParentNode().removeChild(tag);
                continue;
            }
            if (getCharCount(tag, ",") >= 10) {
                continue;
            }

            int p = tag.getElementsByTagName("p").getLength();
            int img = tag.getElementsByTagName("img").getLength();
            int li = tag.getElementsByTagName("li").getLength() - 100;
            int input = tag.getElementsByTagName("input").getLength();

            // Count embeds that are not recognised videos.
            int embedCount = 0;
            NodeList embeds = tag.getElementsByTagName("embed");
            int embedTotal = embeds.getLength();
            for (int e = 0; e < embedTotal; e++) {
                if (search(((Element) embeds.item(e)).getAttribute("src"), Regexps.videoRe) == -1) {
                    embedCount++;
                }
            }

            float linkDensity = getLinkDensity(tag);
            int contentLength = getInnerText(tag).length();

            boolean toRemove = false;
            if (img > p) {
                toRemove = true;
            } else if (li > p && !str.equals("ul") && !str.equals("ol")) {
                // Tag names are compared by value, not by reference.
                toRemove = true;
            } else if (input > p / 3) {
                toRemove = true;
            } else if (contentLength < 25 && (img == 0 || img > 2)) {
                toRemove = true;
            } else if (weight < 25 && linkDensity > 0.2d) {
                toRemove = true;
            } else if (weight >= 25 && linkDensity > 0.5d) {
                toRemove = true;
            } else if ((embedCount == 1 && contentLength < 75) || embedCount > 1) {
                toRemove = true;
            }

            // Keep a block that consists of exactly one visible image.
            if (img == 1 && p == 0 && contentLength == 0) {
                Element image = (Element) tag.getElementsByTagName("img").item(0);
                String width = image.getAttribute("width") != null ? image.getAttribute("width") : "";
                String height = image.getAttribute("height") != null ? image.getAttribute("height") : "";
                if (!width.equals("0") && !height.equals("0")) {
                    toRemove = false;
                }
            }

            if (toRemove) {
                dbg("Removing " + label);
                tag.getParentNode().removeChild(tag);
            }
        }
    }

    /**
     * Counts how many times a separator occurs in the element's text.
     * <p>
     * The separator is interpreted as a regular expression, as {@link String#split}
     * does. A negative limit keeps trailing empty pieces, so separators at the end of
     * the text are counted and text consisting only of separators no longer yields -1.
     *
     * @param element the element whose text is examined
     * @param str     the separator (regular expression)
     * @return the number of separator occurrences
     */
    protected int getCharCount(Element element, String str) {
        return getInnerText(element).split(str, -1).length - 1;
    }

    /**
     * Counts the commas in the element's text.
     *
     * @param element the element whose text is examined
     * @return the value of {@link #getCharCount(Element, String)} for {@code ","}
     */
    protected int getCharCount(Element element) {
        final String comma = ",";
        return getCharCount(element, comma);
    }

    /** @return the title chosen for the article during initialisation */
    public String getArticleTitle() {
        return articleTitle;
    }

    /** @return the content of the Content-Type meta tag, or {@code null} when none was found */
    public String getArticleContentType() {
        return article_contentType;
    }

    protected Element grabArticle() {
        Node node;
        boolean contains = this.flags.contains(Flag.FLAG_STRIP_UNLIKELYS);
        ArrayList arrayList = new ArrayList();
        int i = 0;
        while (true) {
            Element element = (Element) this.document.getElementsByTagName("*").item(i);
            if (element == null) {
                break;
            }
            if (contains) {
                String str = element.getAttribute("class") + element.getAttribute("id");
                if (search(str, Regexps.unlikelyCandidatesRe) != -1 && search(str, Regexps.okMaybeItsACandidateRe) == -1 && !element.getTagName().equals("BODY")) {
                    dbg("Removing unlikely candidate - " + str);
                    element.getParentNode().removeChild(element);
                    i--;
                    i++;
                }
            }
            if (element.getTagName().equals("P") || element.getTagName().equals("TD")) {
                arrayList.add(element);
            }
            if (element.getTagName().equals("DIV")) {
                if (search(getInnerHTML(element), Regexps.divToPElementsRe) == -1) {
                    dbg("Altering div to p");
                    Element createElement = this.document.createElement("P");
                    NodeList childNodes = element.getChildNodes();
                    for (int i2 = 0; i2 < childNodes.getLength(); i2++) {
                        createElement.appendChild(childNodes.item(i2));
                    }
                    element.getParentNode().replaceChild(createElement, element);
                    i--;
                } else {
                    int length = element.getChildNodes().getLength();
                    for (int i3 = 0; i3 < length; i3++) {
                        Node item = element.getChildNodes().item(i3);
                        if (item.getNodeType() == 3) {
                            dbg("replacing text node with a p tag with the same content.");
                            Element createElement2 = this.document.createElement("p");
                            createElement2.setNodeValue(item.getNodeValue());
                            createElement2.setTextContent(item.getTextContent());
                            createElement2.setAttribute("class", "readability-styled");
                            item.getParentNode().replaceChild(createElement2, item);
                        }
                    }
                }
            }
            i++;
        }
        ArrayList arrayList2 = new ArrayList();
        for (int i4 = 0; i4 < arrayList.size(); i4++) {
            Element element2 = (Element) ((Element) arrayList.get(i4)).getParentNode();
            Element element3 = (Element) element2.getParentNode();
            if (getInnerText((Element) arrayList.get(i4)).length() >= 25) {
                if (element2.getUserData("readability") == null) {
                    initializeNode(element2);
                    arrayList2.add(element2);
                }
                if (element3.getUserData("readability") == null) {
                    initializeNode(element3);
                    arrayList2.add(element3);
                }
                float length2 = (float) (0.0f + 1.0f + r0.split(",").length + Math.min(Math.floor(r0.length() / 100.0f), 3.0d));
                element2.setUserData("readability", Float.valueOf(((Float) element2.getUserData("readability")).floatValue() + length2), null);
                element3.setUserData("readability", Float.valueOf(((Float) element3.getUserData("readability")).floatValue() + (length2 / 2.0f)), null);
            }
        }
        Element element4 = null;
        int size = arrayList2.size();
        for (int i5 = 0; i5 < size; i5++) {
            ((Element) arrayList2.get(i5)).setUserData("readability", Float.valueOf(((Float) ((Element) arrayList2.get(i5)).getUserData("readability")).floatValue() * (1.0f - getLinkDensity((Element) arrayList2.get(i5)))), null);
            dbg("Candidate: " + arrayList2.get(i5) + " (" + ((Element) arrayList2.get(i5)).getAttribute("class") + ":" + ((Element) arrayList2.get(i5)).getAttribute("id") + ") with score " + ((Element) arrayList2.get(i5)).getUserData("readability"));
            if (element4 == null || ((Float) ((Element) arrayList2.get(i5)).getUserData("readability")).floatValue() > ((Float) element4.getUserData("readability")).floatValue()) {
                element4 = (Element) arrayList2.get(i5);
            }
        }
        if (element4 != null) {
            dbg("==> TOP Candidate: " + element4 + " (" + element4.getAttribute("class") + ":" + element4.getAttribute("id") + ") with score " + element4.getUserData("readability"));
        }
        if (element4 == null || element4.getTagName().equals("BODY")) {
            element4 = this.document.createElement("DIV");
            NodeList childNodes2 = getBody().getChildNodes();
            for (int i6 = 0; i6 < childNodes2.getLength(); i6++) {
                element4.appendChild(childNodes2.item(i6));
            }
            getBody().appendChild(element4);
            initializeNode(element4);
        }
        Element createElement3 = this.document.createElement("DIV");
        createElement3.setAttribute("id", "readability-content");
        float max = Math.max(10.0f, ((Float) element4.getUserData("readability")).floatValue() * 0.2f);
        NodeList childNodes3 = element4.getParentNode().getChildNodes();
        int i7 = 0;
        int length3 = childNodes3.getLength();
        while (i7 < length3) {
            Node item2 = childNodes3.item(i7);
            if (item2 instanceof Element) {
                dbg("Looking at sibling node: " + item2 + " (" + ((Element) item2).getAttribute("class") + ":" + ((Element) item2).getAttribute("id") + ")" + (item2.getUserData("readability") != null ? " with score " + item2.getUserData("readability") : ""));
            }
            dbg("Sibling has score " + (item2.getUserData("readability") != null ? item2.getUserData("readability") : "Unknown"));
            boolean z = item2 == element4;
            float f = 0.0f;
            if ((item2 instanceof Element) && ((Element) item2).getAttribute("class").equals(element4.getAttribute("class")) && !element4.getAttribute("class").equals("")) {
                f = 0.0f + (((Float) element4.getUserData("readability")).floatValue() * 0.2f);
            }
            if (item2.getUserData("readability") != null && ((Float) item2.getUserData("readability")).floatValue() + f >= max) {
                z = true;
            }
            if (item2.getNodeName().equals("P")) {
                float linkDensity = getLinkDensity((Element) item2);
                String innerText = getInnerText((Element) item2);
                int length4 = innerText.length();
                if (length4 > 80 && linkDensity < 0.25d) {
                    z = true;
                } else if (length4 < 80 && linkDensity == 0.0f && search(innerText, "\\.( |$)") != -1) {
                    z = true;
                }
            }
            if (z) {
                dbg("Appending node: " + item2);
                if (item2.getNodeName().equals("DIV") || item2.getNodeName().equals("P")) {
                    node = item2;
                    i7--;
                    length3--;
                } else {
                    dbg("Altering siblingNode of " + item2.getNodeName() + " to div.");
                    node = this.document.createElement("div");
                    if (item2 instanceof Element) {
                        ((Element) node).setAttribute("id", ((Element) item2).getAttribute("id"));
                    }
                    NodeList childNodes4 = item2.getChildNodes();
                    for (int i8 = 0; i8 < childNodes4.getLength(); i8++) {
                        node.appendChild(childNodes4.item(i8));
                    }
                }
                if (node instanceof Element) {
                    ((Element) node).setAttribute("class", "");
                }
                createElement3.appendChild(node);
            }
            i7++;
        }
        prepArticle(createElement3);
        return createElement3;
    }

    /**
     * Builds a markup string for the content of a node.
     * <p>
     * A text node yields its own text content. For any other node the children
     * are concatenated in order: text children contribute their text content,
     * comment children are wrapped between {@code "<!-- "} and {@code " -->"},
     * and every other child is serialised through {@code nodeToString(child)}.
     *
     * @param node the node whose content is rendered
     * @return the concatenated content; an empty string when a non-text node
     *         has no children
     */
    protected String getInnerHTML(Node node) {
        if (node.getNodeType() == Node.TEXT_NODE) {
            return node.getTextContent();
        }
        final StringBuilder html = new StringBuilder();
        final NodeList children = node.getChildNodes();
        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);
            switch (child.getNodeType()) {
                case Node.TEXT_NODE:
                    html.append(child.getTextContent());
                    break;
                case Node.COMMENT_NODE:
                    html.append("<!-- ").append(child.getTextContent()).append(" -->");
                    break;
                default:
                    html.append(nodeToString(child));
                    break;
            }
        }
        return html.toString();
    }

    /**
     * Serialises a node with pretty-printing switched off.
     *
     * @param node the node to serialise
     * @return the result of {@code nodeToString(node, false)}
     */
    protected String nodeToString(Node node) {
        final boolean prettyPrint = false;
        return nodeToString(node, prettyPrint);
    }

    /**
     * Serialises a node to a string using a DOM Load/Save serializer obtained
     * from the {@link DOMImplementationRegistry}.
     * <p>
     * The {@code xml-declaration} parameter is always disabled;
     * {@code format-pretty-print} is enabled only on request.
     *
     * @param node the node to serialise
     * @param z    {@code true} to enable the serializer's pretty-print mode
     * @return the serialised form of {@code node}
     * @throws RuntimeException wrapping any exception raised while obtaining,
     *                          configuring or running the serializer
     */
    protected String nodeToString(Node node, boolean z) {
        try {
            final DOMImplementationRegistry registry = DOMImplementationRegistry.newInstance();
            final DOMImplementationLS loadSave = (DOMImplementationLS) registry.getDOMImplementation("LS");
            final LSSerializer serializer = loadSave.createLSSerializer();
            serializer.getDomConfig().setParameter("xml-declaration", Boolean.FALSE);
            if (z) {
                serializer.getDomConfig().setParameter("format-pretty-print", Boolean.TRUE);
            }
            return serializer.writeToString(node);
        } catch (Exception failure) {
            // Every failure is surfaced unchecked, with the original cause preserved.
            throw new RuntimeException(failure);
        }
    }

    /**
     * Parses a markup string into a document fragment created from this
     * instance's document.
     *
     * @param str the markup to parse
     * @return the {@link DocumentFragment} that the parser filled in
     * @throws RuntimeException wrapping any exception raised while creating the
     *                          fragment or parsing the input
     */
    protected Node stringToNode(String str) {
        try {
            final DOMFragmentParser fragmentParser = new DOMFragmentParser();
            final DocumentFragment target = this.document.createDocumentFragment();
            final InputSource markup = new InputSource(new StringReader(str));
            fragmentParser.parse(markup, target);
            return target;
        } catch (Exception failure) {
            throw new RuntimeException(failure);
        }
    }

    /**
     * Returns the text content of an element after removing every match of
     * {@code Regexps.trimRe}.
     *
     * @param element the element whose text is read
     * @param z       when {@code true}, every match of
     *                {@code Regexps.normalizeRe} is additionally replaced by a
     *                single space
     * @return the processed text content
     */
    protected String getInnerText(Element element, boolean z) {
        final String trimmed = element.getTextContent().replaceAll(Regexps.trimRe, "");
        if (!z) {
            return trimmed;
        }
        return trimmed.replaceAll(Regexps.normalizeRe, " ");
    }

    /**
     * Collects the text of a subtree, appending one space after the text of
     * every childless node.
     * <p>
     * Children whose node name equals {@code "script"} (ignoring case) are
     * skipped together with everything below them.
     *
     * @param node the root of the subtree to read
     * @return the space-separated text; for a childless node, its text content
     *         followed by a single space
     */
    protected String getInnerTextSep(Node node) {
        if (!node.hasChildNodes()) {
            return node.getTextContent() + " ";
        }
        final StringBuilder text = new StringBuilder();
        final NodeList children = node.getChildNodes();
        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);
            if (child.getNodeName().equalsIgnoreCase("script")) {
                continue;
            }
            text.append(getInnerTextSep(child));
        }
        return text.toString();
    }

    /**
     * Returns the processed text of an element with the
     * {@code Regexps.normalizeRe} replacement enabled.
     *
     * @param element the element whose text is read
     * @return the result of {@code getInnerText(element, true)}
     */
    protected String getInnerText(Element element) {
        final boolean applyNormalizeRe = true;
        return getInnerText(element, applyNormalizeRe);
    }

    /**
     * Returns the article content serialised with pretty-printing enabled.
     *
     * @return the serialised article content, or an empty string when no
     *         article content is set
     */
    public String getArticleHTML() {
        if (this.articleContent == null) {
            return "";
        }
        return nodeToString(this.articleContent, true);
    }

    /**
     * Returns the article content as a DOM node.
     *
     * @return the {@code articleContent} element; {@code null} when it has not
     *         been set
     */
    public Node getArticleHTML_DOM() {
        final Node content = articleContent;
        return content;
    }

    /**
     * Returns the stored article date string.
     *
     * @return the current value of {@code article_date_string}; may be
     *         {@code null}
     */
    protected String getArticleDateString() {
        final String dateString = article_date_string;
        return dateString;
    }

    /**
     * Returns the stored article date.
     *
     * @return the current value of {@code article_date}; may be {@code null}
     */
    public Date getArticleDate() {
        final Date date = article_date;
        return date;
    }

    /**
     * Returns the article as plain text.
     * <p>
     * The text content of the article element is trimmed, every run of two or
     * more line-break characters (CR or LF) is collapsed to a single blank line
     * ({@code "\n\n"}), and every run of two or more spaces is collapsed to one
     * space.
     *
     * @return the cleaned-up text, or the literal message
     *         {@code "Unable to find article content"} when no article content
     *         is set
     */
    public String getArticleText() {
        if (this.articleContent == null) {
            return "Unable to find article content";
        }
        // The class holds only CR and LF. Inside a character class '|' is a
        // literal, so listing it would make runs of pipe characters count as
        // line breaks and rewrite them to a blank line.
        return this.articleContent.getTextContent()
                .trim()
                .replaceAll("[\r\n]{2,}", "\n\n")
                .replaceAll(" {2,}", " ");
    }

    /**
     * Lists the anchors found inside the article content.
     * <p>
     * One {@code Anchor} is created per element returned by
     * {@code getElementsByTagName("a")}, built from the element's inner text
     * and its {@code href} attribute.
     *
     * @return the anchors in the order the elements are returned; an empty list
     *         when no article content is set
     */
    public List<Anchor> getArticleLinks() {
        final List<Anchor> links = new ArrayList<Anchor>();
        if (this.articleContent != null) {
            final NodeList anchors = this.articleContent.getElementsByTagName("a");
            for (int idx = 0; idx < anchors.getLength(); idx++) {
                final Element anchor = (Element) anchors.item(idx);
                final String text = getInnerText(anchor);
                final String href = anchor.getAttribute("href");
                links.add(new Anchor(text, href));
            }
        }
        return links;
    }

    /**
     * Lists the anchors found anywhere in the document.
     * <p>
     * One {@code Anchor} is created per element returned by
     * {@code getElementsByTagName("a")} on the document, built from the
     * element's inner text and its {@code href} attribute.
     *
     * @return the anchors in the order the elements are returned
     */
    public List<Anchor> getAllLinks() {
        final NodeList anchors = this.document.getElementsByTagName("a");
        final List<Anchor> links = new ArrayList<Anchor>();
        int idx = 0;
        while (idx < anchors.getLength()) {
            final Element anchor = (Element) anchors.item(idx);
            final String text = getInnerText(anchor);
            links.add(new Anchor(text, anchor.getAttribute("href")));
            idx++;
        }
        return links;
    }

    /**
     * Lists the {@code src} attribute of every {@code img} element inside the
     * article content.
     *
     * @return the attribute values in the order the elements are returned; an
     *         empty list when no article content is set
     */
    public List<String> getArticleImages() {
        final List<String> sources = new ArrayList<String>();
        if (this.articleContent != null) {
            final NodeList images = this.articleContent.getElementsByTagName("img");
            for (int idx = 0; idx < images.getLength(); idx++) {
                final Element image = (Element) images.item(idx);
                sources.add(image.getAttribute("src"));
            }
        }
        return sources;
    }

    /**
     * Collects the sub-headings of the article.
     * <p>
     * Heading levels are probed from {@code h1} down to {@code h6}; the text of
     * every heading at the first level that occurs at all is returned. When no
     * heading element is present, the text of every element whose
     * {@code class} attribute matches {@code Regexps.likelySubheadCandidateRe}
     * is returned instead.
     *
     * @return the sub-heading texts; an empty list when no article content is
     *         set or nothing qualifies
     */
    public List<String> getArticleSubheadings() {
        final List<String> subheadings = new ArrayList<String>();
        if (this.articleContent == null) {
            return subheadings;
        }

        for (int level = 1; level <= 6; level++) {
            final NodeList headings = this.articleContent.getElementsByTagName("h" + level);
            if (headings.getLength() > 0) {
                for (int idx = 0; idx < headings.getLength(); idx++) {
                    subheadings.add(headings.item(idx).getTextContent());
                }
                // Stop at the first level that has headings. Without this exit
                // the same level would be scanned and appended over and over.
                break;
            }
        }

        if (subheadings.isEmpty()) {
            // Fallback: no heading tags at all, so look for class names that
            // mark an element as a likely sub-heading.
            final NodeList everything = this.articleContent.getElementsByTagName("*");
            for (int idx = 0; idx < everything.getLength(); idx++) {
                final Node candidate = everything.item(idx);
                if (!(candidate instanceof Element)) {
                    continue;
                }
                final String cssClass = ((Element) candidate).getAttribute("class");
                if (cssClass != null && search(cssClass, Regexps.likelySubheadCandidateRe) != -1) {
                    subheadings.add(candidate.getTextContent());
                }
            }
        }
        return subheadings;
    }

    /**
     * Finds the direct children of a node that carry a given node name.
     * <p>
     * The comparison is an exact, case-sensitive {@code equals} on
     * {@link Node#getNodeName()}; descendants below the first level are not
     * inspected.
     *
     * @param node the parent whose children are examined
     * @param str  the node name to match
     * @return the matching children in child order; empty when none match
     */
    protected List<Node> findChildNodesWithName(Node node, String str) {
        final NodeList children = node.getChildNodes();
        final List<Node> matches = new ArrayList<Node>();
        for (int idx = 0; idx < children.getLength(); idx++) {
            final Node child = children.item(idx);
            if (child == null) {
                continue;
            }
            final String name = child.getNodeName();
            if (name != null && name.equals(str)) {
                matches.add(child);
            }
        }
        return matches;
    }

    /**
     * Finds the position of a node among the direct children of another node.
     * <p>
     * Children are compared by reference identity.
     *
     * @param node  the parent whose child list is searched
     * @param node2 the child to locate
     * @return the zero-based index of {@code node2} in the child list of
     *         {@code node}, or {@code -1} when it is not a direct child
     */
    protected int findChildNodeIndex(Node node, Node node2) {
        final NodeList children = node.getChildNodes();
        int position = 0;
        while (position < children.getLength()) {
            if (children.item(position) == node2) {
                return position;
            }
            position++;
        }
        return -1;
    }

    /**
     * Recursively records the non-blank text nodes below the walker's current
     * node whose parent carries an {@code id} attribute.
     * <p>
     * Each recorded entry is a {@code MappingNode} built from
     * {@code "<parent id>[<index of the text node among its siblings>]"} and
     * the text node's value. The walker's current node is restored before the
     * method returns.
     *
     * @param treeWalker the walker positioned on the subtree root to visit
     * @param list       the list that receives the entries
     * @throws DOMException declared for the DOM calls made while walking
     */
    protected void getArticleTextMapping(TreeWalker treeWalker, List<MappingNode> list) throws DOMException {
        final Node current = treeWalker.getCurrentNode();
        if (current.getNodeType() == Node.TEXT_NODE) {
            final Node parent = current.getParentNode();
            final Node idAttribute = parent.getAttributes().getNamedItem("id");
            if (idAttribute != null && current.getTextContent().trim().length() > 0) {
                final int position = findChildNodeIndex(parent, current);
                if (position != -1) {
                    final String key = idAttribute.getNodeValue() + "[" + position + "]";
                    list.add(new MappingNode(key, current.getNodeValue()));
                }
            }
        }
        // Each recursive call leaves the walker on the child it was given, so
        // nextSibling() advances from that child.
        for (Node child = treeWalker.firstChild(); child != null; child = treeWalker.nextSibling()) {
            getArticleTextMapping(treeWalker, list);
        }
        treeWalker.setCurrentNode(current);
    }

    /**
     * Builds the text mapping for the article content.
     * <p>
     * A tree walker that shows element and text nodes (with entity references
     * expanded) is created over the article content and handed to
     * {@code getArticleTextMapping(TreeWalker, List)}.
     *
     * @return the collected entries, or {@code null} when no article content is
     *         set
     * @throws ClassCastException if the underlying document does not implement
     *                            {@link DocumentTraversal}
     */
    public List<MappingNode> getArticleTextMapping() {
        if (this.articleContent == null) {
            return null;
        }
        final List<MappingNode> mapping = new ArrayList<MappingNode>();
        // createTreeWalker is declared on DocumentTraversal, not on Document,
        // so the document has to be viewed through that interface.
        final DocumentTraversal traversal = (DocumentTraversal) this.document;
        final TreeWalker walker = traversal.createTreeWalker(
                this.articleContent,
                NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT,
                null,
                true);
        getArticleTextMapping(walker, mapping);
        return mapping;
    }

    /**
     * Parses an HTML string and builds a {@code Readability} for it, passing
     * {@code false} for the second argument of
     * {@code getReadability(String, boolean)}.
     *
     * @param str the HTML to parse
     * @return the resulting {@code Readability} instance
     * @throws SAXException if thrown by the delegate
     * @throws IOException  if thrown by the delegate
     */
    public static Readability getReadability(String str) throws SAXException, IOException {
        final boolean flag = false;
        return getReadability(str, flag);
    }

    /**
     * Parses an HTML string into a DOM document and builds a
     * {@code Readability} for it.
     *
     * @param str the HTML to parse
     * @param z   forwarded as the third constructor argument of
     *            {@code Readability}; the second argument is always
     *            {@code false} (NOTE(review): presumably these are the debug
     *            and add-title switches; confirm against the constructor)
     * @return the resulting {@code Readability} instance
     * @throws SAXException if the parser reports a parse error
     * @throws IOException  if reading the input fails
     */
    public static Readability getReadability(String str, boolean z) throws SAXException, IOException {
        final DOMParser parser = new DOMParser();
        final InputSource html = new InputSource(new StringReader(str));
        parser.parse(html);
        final Document parsed = parser.getDocument();
        return new Readability(parsed, false, z);
    }

    /**
     * Demo entry point: downloads a fixed news page, runs the extraction on it
     * and prints the article text followed by the text mapping entries.
     *
     * @param strArr command-line arguments (unused)
     * @throws Exception if downloading or parsing the page fails
     */
    public static void main(String[] strArr) throws Exception {
        final URL url = new URL("http://news.bbc.co.uk/1/hi/politics/10362367.stm");
        final DOMParser parser = new DOMParser();
        // Close the network stream even when parsing fails.
        final java.io.InputStream in = url.openStream();
        try {
            parser.parse(new InputSource(in));
        } finally {
            in.close();
        }
        final Readability readability = new Readability(parser.getDocument(), false, true);
        System.out.println(readability.getArticleText());
        System.out.println();
        System.out.println("***");
        System.out.println();
        // getArticleTextMapping() returns null when no article content was found.
        final List<MappingNode> mapping = readability.getArticleTextMapping();
        if (mapping != null) {
            for (final MappingNode entry : mapping) {
                System.out.println(entry);
            }
        }
    }
}
