/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import java.util.List;
import java.util.Properties;
import org.junit.Assert;
import org.junit.Test;

public class WordsToSentencesAnnotatorTest {
    /**
     * Raw English inputs for {@code testDatelineSeparation}. Each entry is annotated on its own
     * and is expected to produce exactly two sentences. Some entries are wrapped in P tags.
     */
    private static final String[] dateLineTexts = new String[]{"<P>\nGAZA, Dec. 1 (Xinhua) -- Hamas will respect any Palestinian referendum on a\npeaceful settlement with Israel even if the agreement was against its agenda,\ndeposed Prime Minister of the Hamas government Ismail Haneya said Wednesday.\n</P>\n", "\nLOS ANGELES, Dec. 31 (Xinhua) -- Body", "\nCARBONDALE, United States, Dec. 13 (Xinhua) -- Body", "<P>\nBRISBANE, Australia, Jan. 1(Xinhua) -- Body.</P>", "\nRIO DE JANEIRO, Dec. 31 (Xinhua) -- Body", "\nPORT-AU-PRINCE, Jan. 1 (Xinhua) -- Body", "\nWASHINGTON, May 12 (AFP) -- Body", "\nPanama  City,  Sept. 8 (CNA) -- Body", "\nUNITED NATIONS, April 3 (Xinhua) -- The", "<P>\nSAN FRANCISCO - California\n</P>", "<P>\nRIO DE JANEIRO - Edward J. Snowden\n</P>", "<P>\nPARETS DEL VALL\u00c8S, Spain - From\n</P>"};
    /**
     * Expected space-joined tokens of the first sentence for the entry of {@link #dateLineTexts}
     * at the same index. The two arrays must have the same length; the test asserts this.
     */
    private static final String[] dateLineTokens = new String[]{"GAZA , Dec. 1 -LRB- Xinhua -RRB- --", "LOS ANGELES , Dec. 31 -LRB- Xinhua -RRB- --", "CARBONDALE , United States , Dec. 13 -LRB- Xinhua -RRB- --", "BRISBANE , Australia , Jan. 1 -LRB- Xinhua -RRB- --", "RIO DE JANEIRO , Dec. 31 -LRB- Xinhua -RRB- --", "PORT-AU-PRINCE , Jan. 1 -LRB- Xinhua -RRB- --", "WASHINGTON , May 12 -LRB- AFP -RRB- --", "Panama City , Sept. 8 -LRB- CNA -RRB- --", "UNITED NATIONS , April 3 -LRB- Xinhua -RRB- --", "SAN FRANCISCO -", "RIO DE JANEIRO -", "PARETS DEL VALL\u00c8S , Spain -"};
    /**
     * Raw Spanish inputs for {@code testSpanishDatelineSeparation}. Each entry is annotated on
     * its own and is expected to produce exactly two sentences.
     */
    private static final String[] dateLineSpanishTexts = new String[]{"<P>\n\nEL CAIRO, 30 jun (Xinhua) -- Al menos una persona.\n", "\nMONTEVIDEO, 1 jul (Xinhua) -- Los diarios uruguayos", "\nRIO DE JANEIRO, 30 jun (Xinhua) -- La selecci\u00f3n brasile\u00f1a", "\nSALVADOR DE BAHIA, Brasil, 30 jun (Xinhua) -- La selecci\u00f3n italiana", "\nLA HAYA, 31 dic (Xinhua) -- Dos candidatos holandeses", "\nJERUSALEN, 1 ene (Xinhua) -- El presidente de Israel", "\nCANBERRA (Xinhua) -- El calentamiento oce\u00e1nico"};
    /**
     * Expected space-joined tokens of the first sentence for the entry of
     * {@link #dateLineSpanishTexts} at the same index. The two arrays must have the same length;
     * the test asserts this.
     */
    private static final String[] dateLineSpanishTokens = new String[]{"EL CAIRO , 30 jun =LRB= Xinhua =RRB= --", "MONTEVIDEO , 1 jul =LRB= Xinhua =RRB= --", "RIO DE JANEIRO , 30 jun =LRB= Xinhua =RRB= --", "SALVADOR DE BAHIA , Brasil , 30 jun =LRB= Xinhua =RRB= --", "LA HAYA , 31 dic =LRB= Xinhua =RRB= --", "JERUSALEN , 1 ene =LRB= Xinhua =RRB= --", "CANBERRA =LRB= Xinhua =RRB= --"};
    /** XML-tagged English news document that {@code testKbpWorks} feeds through the pipeline. */
    private static final String kbpDocument = "<DOC    id=\"ENG_NW_001278_20130413_F00012OVI\">\n<DATE_TIME>2013-04-13T04:49:26</DATE_TIME>\n<HEADLINE>\nUrgent: powerful quake jolts western Japan\n</HEADLINE>\n<AUTHOR>\u9a6c\u5174\u534e</AUTHOR>\n<TEXT>\nUrgent: powerful quake jolts western Japan\n\nUrgent: powerful quake jolts western Japan\n\nOSAKA, April 13 (Xinhua) -- A powerful earthquake stroke a wide area in Japan's Kinki region in western Japan early Saturday. The quake was strongly felt in Osaka. Enditem\n</TEXT>\n</DOC>\n";
    /**
     * Expected sentences for {@link #kbpDocument}, in order, each written as its space-joined
     * tokens. The test also asserts that the sentence count equals this array's length.
     */
    private static final String[] kbpSentences = new String[]{"Urgent : powerful quake jolts western Japan", "Urgent : powerful quake jolts western Japan", "Urgent : powerful quake jolts western Japan", "OSAKA , April 13 -LRB- Xinhua -RRB- --", "A powerful earthquake stroke a wide area in Japan 's Kinki region in western Japan early Saturday .", "The quake was strongly felt in Osaka .", "Enditem"};
    /** XML-tagged Spanish news document that {@code testKbpSpanishWorks} feeds through the pipeline. */
    private static final String kbpSpanishDocument = "<DOC    id=\"SPA_NW_001278_20130701_F00013T62\">\n<DATE_TIME>2013-07-01T03:06:44</DATE_TIME>\n<HEADLINE>\nMuere una persona y 37 resultan heridas en manifestaci\u00f3n contra presidente egipcio\n</HEADLINE>\n<AUTHOR/>\n<TEXT>\nMuere una persona y 37 resultan heridas en manifestaci\u00f3n contra presidente egipcio\n\nEL CAIRO, 30 jun (Xinhua) -- Al menos una persona muri\u00f3 y 37 resultaron heridas hoy en un ataque armado lanzado en una protesta contra el presidente de Egipto, Mohamed Morsi, en Beni Suef, al sur de la capital egipcia de El Cairo, inform\u00f3 la agencia estatal de noticias MENA. Fin\n</TEXT>\n</DOC>\n";
    /**
     * Expected sentences for {@link #kbpSpanishDocument}, in order, each written as its
     * space-joined tokens. The test also asserts that the sentence count equals this array's
     * length.
     */
    private static final String[] kbpSpanishSentences = new String[]{"Muere una persona y 37 resultan heridas en manifestaci\u00f3n contra presidente egipcio", "Muere una persona y 37 resultan heridas en manifestaci\u00f3n contra presidente egipcio", "EL CAIRO , 30 jun =LRB= Xinhua =RRB= --", "Al menos una persona muri\u00f3 y 37 resultaron heridas hoy en un ataque armado lanzado en una protesta contra el presidente de Egipto , Mohamed Morsi , en Beni Suef , al sur de la capital egipcia de El Cairo , inform\u00f3 la agencia estatal de noticias MENA .", "Fin"};

    /**
     * Checks the sentence count for two passages that contain abbreviations ("Dr.", "Mr.",
     * "Ph.D.", "Ms.") and a URL; the expected counts are 3 and 4 respectively.
     */
    @Test
    public void testAnnotator() {
        String abbreviationsAndUrl = "I saw Dr. Spock yesterday, he was speaking with Mr. McCoy.  They were walking down Mullholand Dr. talking about www.google.com.  Dr. Spock returns!";
        runSentence(abbreviationsAndUrl, 3);

        String titlesAndFragment = "I visited Google Research.  Dr. Spock, Ph.D., was working there and said it's an awful place!  What a waste of Ms. Pacman's last remaining life. Indeed";
        runSentence(titlesAndFragment, 4);
    }

    /**
     * Annotates {@code text} with a freshly built {@code tokenize,ssplit} pipeline (tokenizer
     * language {@code en}) and asserts that the sentence list is present and has exactly
     * {@code num_sentences} entries.
     *
     * @param text raw text to annotate
     * @param num_sentences expected number of sentences
     */
    private static void runSentence(String text, int num_sentences) {
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize,ssplit",
                "tokenize.language", "en");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation doc = new Annotation(text);
        pipeline.annotate(doc);

        // The typed get() already yields List<CoreMap>; no raw List or unchecked cast is needed.
        List<CoreMap> sentences = doc.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertNotNull(sentences);
        Assert.assertEquals(num_sentences, sentences.size());
    }

    /**
     * With {@code ssplit.eolonly=true} and {@code tokenize.whitespace=true}, a four-line text is
     * expected to yield four sentences.
     */
    @Test
    public void testSentenceSplitting() {
        String text = "Date :\n01/02/2012\nContent :\nSome words are here .\n";
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, ssplit",
                "ssplit.eolonly", "true",
                "tokenize.whitespace", "true");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookup replaces the raw List and unchecked cast.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(4, sentences.size());
    }

    /**
     * With {@code tokenize.options=tokenizeNLs} and otherwise default sentence-splitter settings,
     * the two-paragraph text is expected to yield a single sentence and 13 document tokens.
     */
    @Test
    public void testTokenizeNLsDoesntChangeSsplitResults() {
        String text = "This is one sentence\n\nThis is not another with default ssplit settings.";
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, ssplit",
                "tokenize.options", "tokenizeNLs");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookups replace the raw List variables and unchecked casts.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(1, sentences.size());

        // Only the size is needed, so a wildcard list avoids naming the token type.
        List<?> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
        Assert.assertEquals(13, tokens.size());
    }

    /**
     * With only the {@code annotators} property set (all tokenizer and splitter options left at
     * their defaults), the two-paragraph text is expected to yield a single sentence and 13
     * document tokens.
     */
    @Test
    public void testDefaultNewlineIsSentenceBreakSettings() {
        String text = "This is one sentence\n\nThis is not another with default ssplit settings.";
        Properties props = PropertiesUtils.asProperties("annotators", "tokenize, ssplit");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookups replace the raw List variables and unchecked casts.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(1, sentences.size());

        // Only the size is needed, so a wildcard list avoids naming the token type.
        List<?> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
        Assert.assertEquals(13, tokens.size());
    }

    /**
     * With {@code ssplit.newlineIsSentenceBreak=two}, the text (one single newline, one double
     * newline) is expected to yield two sentences and 9 document tokens.
     */
    @Test
    public void testTwoNewlineIsSentenceBreakSettings() {
        String text = "This is \none sentence\n\nThis is not another.";
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, ssplit",
                "ssplit.newlineIsSentenceBreak", "two");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookups replace the raw List variables and unchecked casts.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(2, sentences.size());

        // Only the size is needed, so a wildcard list avoids naming the token type.
        List<?> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
        Assert.assertEquals(9, tokens.size());
    }

    /**
     * Same text and {@code ssplit.newlineIsSentenceBreak=two} setting as the plain "two" test, but
     * with explicit tokenizer options ({@code tokenizeNLs,invertible,ptb3Escaping=true}).
     * Expects two sentences, 9 document tokens, and the second sentence to read
     * "This is not another ." when its tokens are joined.
     */
    @Test
    public void testTwoNewlineIsSentenceBreakTokenizeNLs() {
        String text = "This is \none sentence\n\nThis is not another.";
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, ssplit",
                "tokenize.language", "en",
                "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookups replace the raw List variables and the manual CoreMap element cast.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(2, sentences.size());

        List<?> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
        Assert.assertEquals(9, tokens.size());

        // The token list is passed straight through so its element type is inferred.
        String sentenceTwo = SentenceUtils.listToString(
                sentences.get(1).get(CoreAnnotations.TokensAnnotation.class));
        Assert.assertEquals("Bad tokens in sentence", "This is not another .", sentenceTwo);
    }

    /**
     * With {@code ssplit.newlineIsSentenceBreak=always}, the text is expected to yield three
     * sentences and 9 document tokens, and each sentence's joined tokens must match the
     * corresponding entry of the local expectation array.
     */
    @Test
    public void testAlwaysNewlineIsSentenceBreakSettings() {
        String text = "This is \none sentence\n\nThis is not another.";
        String[] sents = {"This is", "one sentence", "This is not another ."};
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, ssplit",
                "ssplit.newlineIsSentenceBreak", "always");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(text);
        pipeline.annotate(document1);

        // Typed lookups replace the raw List variables and the manual CoreMap element cast.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
        Assert.assertEquals(3, sentences.size());

        List<?> tokens = document1.get(CoreAnnotations.TokensAnnotation.class);
        Assert.assertEquals(9, tokens.size());

        // Compare only the overlapping prefix; the size assertion above guards the count.
        int comparable = Math.min(sents.length, sentences.size());
        for (int i = 0; i < comparable; ++i) {
            String sentenceText = SentenceUtils.listToString(
                    sentences.get(i).get(CoreAnnotations.TokensAnnotation.class));
            Assert.assertEquals("Bad sentence #" + i, sents[i], sentenceText);
        }
    }

    /**
     * For each entry of {@code dateLineTexts}, runs a {@code tokenize, cleanxml, ssplit} English
     * pipeline configured with a multi-token boundary regex and asserts that the text yields
     * exactly two sentences, the first of which matches the same-index entry of
     * {@code dateLineTokens} when its tokens are joined.
     */
    @Test
    public void testDatelineSeparation() {
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "en",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? /\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | /\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Guard against the two parallel fixture arrays drifting apart.
        Assert.assertEquals("Bad test data", dateLineTexts.length, dateLineTokens.length);

        for (int i = 0; i < dateLineTexts.length; ++i) {
            Annotation document1 = new Annotation(dateLineTexts[i]);
            pipeline.annotate(document1);

            // Typed lookup replaces the raw List and the manual CoreMap element cast.
            List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
            Assert.assertEquals("For " + dateLineTexts[i] + " annotation is " + document1, 2, sentences.size());

            String sentenceOne = SentenceUtils.listToString(
                    sentences.get(0).get(CoreAnnotations.TokensAnnotation.class));
            Assert.assertEquals("Bad tokens in dateline", dateLineTokens[i], sentenceOne);
        }
    }

    /**
     * For each entry of {@code dateLineSpanishTexts}, runs a {@code tokenize, cleanxml, ssplit}
     * Spanish pipeline configured with a multi-token boundary regex and asserts that the text
     * yields exactly two sentences, the first of which matches the same-index entry of
     * {@code dateLineSpanishTokens} when its tokens are joined.
     */
    @Test
    public void testSpanishDatelineSeparation() {
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ ( /,/  /[-\\p{L}]+/+ )? ( /,/ /[1-3]?[0-9]/ /\\p{Ll}{3,3}/ )? /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        // Guard against the two parallel fixture arrays drifting apart.
        Assert.assertEquals("Bad test data", dateLineSpanishTexts.length, dateLineSpanishTokens.length);

        for (int i = 0; i < dateLineSpanishTexts.length; ++i) {
            Annotation document1 = new Annotation(dateLineSpanishTexts[i]);
            pipeline.annotate(document1);

            // Typed lookup replaces the raw List and the manual CoreMap element cast.
            List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);
            Assert.assertEquals("For " + dateLineSpanishTexts[i] + " annotation is " + document1, 2, sentences.size());

            String sentenceOne = SentenceUtils.listToString(
                    sentences.get(0).get(CoreAnnotations.TokensAnnotation.class));
            Assert.assertEquals("Bad tokens in dateline", dateLineSpanishTokens[i], sentenceOne);
        }
    }

    /**
     * Runs {@code kbpDocument} through an English {@code tokenize, cleanxml, ssplit} pipeline
     * with an extensive {@code clean.*} / {@code ssplit.*} configuration. Asserts that each
     * produced sentence matches the same-index entry of {@code kbpSentences} when its tokens are
     * joined, and then that the total number of sentences equals the array length.
     */
    @Test
    public void testKbpWorks() {
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "en",
                "tokenize.options", "tokenizeNLs,invertible,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "( /\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? /\\p{Lu}\\p{Ll}{2,5}\\.?/ /[1-3]?[0-9]/ /-LRB-/ /\\p{Lu}\\p{L}+/ /-RRB-/ /--/ | /\\*NL\\*/ /\\p{Lu}[-\\p{Lu}]+/+ ( /,/ /[-\\p{L}]+/+ )? /-/ )",
                "clean.xmltags", "headline|dateline|text|post",
                "clean.singlesentencetags", "HEADLINE|DATELINE|SPEAKER|POSTER|POSTDATE",
                "clean.sentenceendingtags", "P|POST|QUOTE",
                "clean.turntags", "TURN|POST|QUOTE",
                "clean.speakertags", "SPEAKER|POSTER",
                "clean.docidtags", "DOCID",
                "clean.datetags", "DATETIME|DATE|DATELINE",
                "clean.doctypetags", "DOCTYPE",
                "clean.docAnnotations", "docID=doc[id],doctype=doc[type],docsourcetype=doctype[source]",
                "clean.sectiontags", "HEADLINE|DATELINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[date|datetime],sectionDate=postdate,author=post[author],author=poster",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(kbpDocument);
        pipeline.annotate(document1);

        // Typed lookup replaces the raw List and the manual CoreMap element cast.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);

        // Compare the overlapping prefix first so a content mismatch is reported
        // before a bare count mismatch.
        int comparable = Math.min(kbpSentences.length, sentences.size());
        for (int i = 0; i < comparable; ++i) {
            String sentenceText = SentenceUtils.listToString(
                    sentences.get(i).get(CoreAnnotations.TokensAnnotation.class));
            Assert.assertEquals("Bad sentence #" + i, kbpSentences[i], sentenceText);
        }
        Assert.assertEquals("Bad total number of sentences", kbpSentences.length, sentences.size());
    }

    /**
     * Runs {@code kbpSpanishDocument} through a Spanish {@code tokenize, cleanxml, ssplit}
     * pipeline with an extensive {@code clean.*} / {@code ssplit.*} configuration. Asserts that
     * each produced sentence matches the same-index entry of {@code kbpSpanishSentences} when its
     * tokens are joined, and then that the total number of sentences equals the array length.
     */
    @Test
    public void testKbpSpanishWorks() {
        Properties props = PropertiesUtils.asProperties(
                "annotators", "tokenize, cleanxml, ssplit",
                "tokenize.language", "es",
                "tokenize.options", "tokenizeNLs,ptb3Escaping=true",
                "ssplit.newlineIsSentenceBreak", "two",
                "ssplit.tokenPatternsToDiscard", "\\n,\\*NL\\*",
                "ssplit.boundaryMultiTokenRegex", "/\\*NL\\*/ /\\p{Lu}[-\\p{L}]+/+ /,/ ( /[-\\p{L}]+/+ /,/ )? /[1-3]?[0-9]/ /\\p{Ll}{3,5}/ /=LRB=/ /\\p{Lu}\\p{L}+/ /=RRB=/ /--/",
                "clean.xmltags", "headline|text|post",
                "clean.singlesentencetags", "HEADLINE|AUTHOR",
                "clean.sentenceendingtags", "TEXT|POST|QUOTE",
                "clean.turntags", "POST|QUOTE",
                "clean.speakertags", "AUTHOR",
                "clean.datetags", "DATE_TIME",
                "clean.doctypetags", "DOC",
                "clean.docAnnotations", "docID=doc[id]",
                "clean.sectiontags", "HEADLINE|POST",
                "clean.sectionAnnotations", "sectionID=post[id],sectionDate=post[datetime],author=post[author]",
                "clean.quotetags", "quote",
                "clean.quoteauthorattributes", "orig_author",
                "clean.tokenAnnotations", "link=a[href],speaker=post[author],speaker=quote[orig_author]");
        StanfordCoreNLP pipeline = new StanfordCoreNLP(props);

        Annotation document1 = new Annotation(kbpSpanishDocument);
        pipeline.annotate(document1);

        // Typed lookup replaces the raw List and the manual CoreMap element cast.
        List<CoreMap> sentences = document1.get(CoreAnnotations.SentencesAnnotation.class);

        // Compare the overlapping prefix first so a content mismatch is reported
        // before a bare count mismatch.
        int comparable = Math.min(kbpSpanishSentences.length, sentences.size());
        for (int i = 0; i < comparable; ++i) {
            String sentenceText = SentenceUtils.listToString(
                    sentences.get(i).get(CoreAnnotations.TokensAnnotation.class));
            Assert.assertEquals("Bad sentence #" + i, kbpSpanishSentences[i], sentenceText);
        }
        Assert.assertEquals("Bad total number of sentences", kbpSpanishSentences.length, sentences.size());
    }
}

