/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.ie.crf.CRFClassifier;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.SegmenterCoreAnnotations;
import edu.stanford.nlp.ling.SentenceUtils;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.PropertiesUtils;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class ChineseSegmenterAnnotator
implements Annotator {
    /** Redwood logging channel used for all messages emitted by this annotator. */
    private static final Redwood.RedwoodChannels log = Redwood.channels(ChineseSegmenterAnnotator.class);
    /** Property prefix passed as the annotator name by the convenience constructors (keys look like {@code segment.model}). */
    private static final String DEFAULT_MODEL_NAME = "segment";
    /** Model location used by the no-argument constructor. */
    private static final String DEFAULT_SEG_LOC = "/u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/ctb7.chris6.lex.gz";
    /**
     * Default value for the {@code segment.serDictionary} property.
     * NOTE(review): this path starts with a double slash, unlike the sibling defaults - confirm it is intentional.
     */
    private static final String DEFAULT_SER_DICTIONARY = "//u/nlp/data/chinese-segmenter/stanford-seg-2010/classifiers-2013/dict-chris6.ser.gz";
    /** Default value for the {@code segment.sighanCorporaDict} property. */
    private static final String DEFAULT_SIGHAN_CORPORA_DICT = "/u/nlp/data/chinese-segmenter/stanford-seg-2010/releasedata/";
    /** Regex source for one line break: CR, LF, CRLF, or the platform line separator. */
    private static final String separator = "(?:\r|\r?\n|" + System.lineSeparator() + ')';
    /** Compiled form of {@link #separator}; used to recognise words that consist solely of a line break. */
    private static final Pattern separatorPattern = Pattern.compile(separator);
    /** Classifier obtained from {@code CRFClassifier.getClassifier}; performs the actual word segmentation. */
    private final AbstractSequenceClassifier<?> segmenter;
    /** When true, progress and debugging details are written to {@link #log}. Read from {@code <name>.verbose}. */
    private final boolean VERBOSE;
    /**
     * True when {@code ssplit.newlineIsSentenceBreak} is anything other than {@code "never"},
     * or when {@code ssplit.eolonly} is true. Line feeds are then kept as characters/tokens.
     */
    private final boolean tokenizeNewline;
    /** True when {@code ssplit.newlineIsSentenceBreak} equals {@code "two"}; isolated single line feeds are then dropped. */
    private final boolean sentenceSplitOnTwoNewlines;
    /** When true, spaces inside XML tokens are replaced by U+00A0. Read from {@code <name>.normalizeSpace}. */
    private final boolean normalizeSpace;
    /**
     * Matches one XML-like tag on a single line: a {@code <!...>} or {@code <?...>} construct,
     * an opening (optionally self-closing) tag with optional attributes, or a closing tag.
     */
    private static final Pattern xmlPattern = Pattern.compile("<([!?][A-Za-z-][^>\r\n]*|[A-Za-z][A-Za-z0-9_:.-]*([ ]+([A-Za-z][A-Za-z0-9_:.-]*|[A-Za-z][A-Za-z0-9_:.-]*[ ]*=[ ]*('[^'\r\n]*'|\"[^\"\r\n]*\"|[A-Za-z][A-Za-z0-9_:.-]*)))*[ ]*/?|/[A-Za-z][A-Za-z0-9_:.-]*)[ ]*>");

    /**
     * Creates an annotator that loads the model at {@code DEFAULT_SEG_LOC}
     * with verbose logging switched off.
     */
    public ChineseSegmenterAnnotator() {
        this(DEFAULT_SEG_LOC, false);
    }

    /**
     * Creates an annotator for the given model, using the default serialized
     * dictionary and SIGHAN corpora dictionary locations.
     *
     * @param segLoc  location of the segmenter model
     * @param verbose whether to log progress and debugging details
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose) {
        this(segLoc, verbose, DEFAULT_SER_DICTIONARY, DEFAULT_SIGHAN_CORPORA_DICT);
    }

    /**
     * Creates an annotator by packing the arguments into a property set and
     * delegating to the properties-based constructor under the name
     * {@code DEFAULT_MODEL_NAME}. The property keys below are written with the
     * literal {@code segment.} prefix, which is the value of that constant.
     *
     * @param segLoc            value for {@code segment.model}
     * @param verbose           value for {@code segment.verbose}
     * @param serDictionary     value for {@code segment.serDictionary}
     * @param sighanCorporaDict value for {@code segment.sighanCorporaDict}
     */
    public ChineseSegmenterAnnotator(String segLoc, boolean verbose, String serDictionary, String sighanCorporaDict) {
        this(
            DEFAULT_MODEL_NAME,
            PropertiesUtils.asProperties(
                "segment.serDictionary", serDictionary,
                "segment.sighanCorporaDict", sighanCorporaDict,
                "segment.verbose", String.valueOf(verbose),
                "segment.model", segLoc));
    }

    /**
     * Creates an annotator configured from {@code props}.
     *
     * <p>Every property whose key starts with {@code name + "."} is considered:
     * {@code <name>.model} names the model to load, and all remaining ones are
     * handed to the classifier loader with the prefix stripped. In addition,
     * {@code <name>.verbose} and {@code <name>.normalizeSpace} are read as
     * booleans (default false), and the un-prefixed
     * {@code ssplit.newlineIsSentenceBreak} / {@code ssplit.eolonly} properties
     * decide how line breaks are treated.
     *
     * @param name  property prefix identifying this annotator
     * @param props configuration properties
     * @throws RuntimeException if {@code <name>.model} is absent, or if loading
     *         the classifier fails (checked failures are wrapped)
     */
    public ChineseSegmenterAnnotator(String name, Properties props) {
        final String prefix = name + '.';
        final Properties segmenterProps = new Properties();
        String modelPath = null;
        for (String propName : props.stringPropertyNames()) {
            if (propName.startsWith(prefix)) {
                String suffix = propName.substring(prefix.length());
                if ("model".equals(suffix)) {
                    // The model location is consumed here rather than forwarded.
                    modelPath = props.getProperty(propName);
                } else {
                    segmenterProps.setProperty(suffix, props.getProperty(propName));
                }
            }
        }

        this.VERBOSE = PropertiesUtils.getBool(props, name + ".verbose", false);
        this.normalizeSpace = PropertiesUtils.getBool(props, name + ".normalizeSpace", false);

        if (modelPath == null) {
            throw new RuntimeException("Expected a property " + name + ".model");
        }
        if (this.VERBOSE) {
            log.info("Loading Segmentation Model ... ");
        }
        try {
            this.segmenter = CRFClassifier.getClassifier(modelPath, segmenterProps);
        } catch (RuntimeException e) {
            // Unchecked failures propagate untouched.
            throw e;
        } catch (Exception e) {
            // Checked failures are wrapped so the constructor declares none.
            throw new RuntimeException(e);
        }

        final String newlineMode = props.getProperty("ssplit.newlineIsSentenceBreak", "never");
        final boolean eolOnly = Boolean.parseBoolean(props.getProperty("ssplit.eolonly", "false"));
        this.tokenizeNewline = !"never".equals(newlineMode) || eolOnly;
        this.sentenceSplitOnTwoNewlines = "two".equals(newlineMode);
    }

    /**
     * Segments the text held by {@code annotation}.
     *
     * <p>If the annotation already carries a sentence list
     * ({@code CoreAnnotations.SentencesAnnotation}), each sentence is segmented
     * on its own; otherwise the annotation itself is segmented as a single unit.
     *
     * @param annotation the document (or pre-split document) to segment
     */
    @Override
    public void annotate(Annotation annotation) {
        if (this.VERBOSE) {
            log.info("Adding Segmentation annotation ... ");
        }
        // Typed list: iterating a raw List as CoreMap does not compile.
        List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
        if (sentences == null) {
            doOneSentence(annotation);
            return;
        }
        for (CoreMap sentence : sentences) {
            doOneSentence(sentence);
        }
    }

    /**
     * Runs both segmentation stages on one text-bearing map: first the text is
     * broken into per-character labels, then those labels are grouped into
     * word tokens. The second stage reads what the first one stored.
     *
     * @param annotation a sentence, or a whole document when no sentences exist
     */
    private void doOneSentence(CoreMap annotation) {
        splitCharacters(annotation);
        runSegmentation(annotation);
    }

    /**
     * Breaks the text of {@code annotation} into one {@link CoreLabel} per code
     * point and stores the list under
     * {@code SegmenterCoreAnnotations.CharactersAnnotation}.
     *
     * <p>Each emitted label carries the character itself, its begin/end
     * offsets in the original text, a segmentation flag ({@code "1"} for the
     * first character emitted after whitespace or at the start of an XML tag,
     * otherwise {@code "0"}) and an XML marker ({@code "0"}, {@code "beginning"},
     * {@code "1"} or {@code "whitespace"}).
     *
     * <p>Whitespace and control characters outside XML tags are dropped, except
     * that a line feed is kept when {@code tokenizeNewline} is set, it is not a
     * leading/trailing line break, and (when {@code sentenceSplitOnTwoNewlines}
     * is set) it is not an isolated single line feed.
     *
     * @param annotation map providing {@code TextAnnotation}; receives the character list
     */
    private void splitCharacters(CoreMap annotation) {
        final String origText = annotation.get(CoreAnnotations.TextAnnotation.class);
        final int length = origText.length();
        final String lineSep = System.lineSeparator();
        final List<CoreLabel> charTokens = new ArrayList<CoreLabel>();

        // Span of the XML tag currently being tracked; the sentinels mean "none found".
        int xmlStartOffset = Integer.MAX_VALUE;
        int xmlEndOffset = -1;
        final Matcher xmlMatcher = xmlPattern.matcher(origText);
        if (xmlMatcher.find()) {
            xmlStartOffset = xmlMatcher.start();
            xmlEndOffset = xmlMatcher.end();
        }

        // Pass 1: find the first and last code points that are not line-break characters.
        int firstNonNewlineOffset = -1;
        int lastNonNewlineOffset = length;
        int scan = 0;
        while (scan < length) {
            final int cp = origText.codePointAt(scan);
            final int width = Character.charCount(cp);
            final boolean lineBreak = cp == '\n' || cp == '\r'
                    || lineSep.contains(origText.substring(scan, scan + width));
            if (!lineBreak) {
                if (firstNonNewlineOffset == -1) {
                    firstNonNewlineOffset = scan;
                }
                lastNonNewlineOffset = scan;
            }
            scan += width;
        }

        // Pass 2: emit one label per retained code point.
        boolean seg = true;
        // Whether the previous code point was a line feed (false before the first one).
        boolean prevIsNewline = false;
        int offset = 0;
        while (offset < length) {
            final int cp = origText.codePointAt(offset);
            final int nextOffset = offset + Character.charCount(cp);
            final String charString = origText.substring(offset, nextOffset);

            // Just stepped past the tracked tag: look for the next one from here.
            // find(int) resets the matcher before searching.
            if (offset == xmlEndOffset && xmlMatcher.find(offset)) {
                xmlStartOffset = xmlMatcher.start();
                xmlEndOffset = xmlMatcher.end();
            }

            final boolean currIsNewline = cp == '\n';
            final boolean nextIsNewline = nextOffset < length && origText.codePointAt(nextOffset) == '\n';

            boolean skipCharacter = false;
            boolean isXMLCharacter = false;
            if (offset == xmlStartOffset) {
                // First character of a tag always opens a new segment.
                seg = true;
                isXMLCharacter = true;
            } else if (offset > xmlStartOffset && offset < xmlEndOffset) {
                seg = false;
                isXMLCharacter = true;
            } else if (Character.isSpaceChar(cp) || Character.isISOControl(cp)) {
                // Whatever follows whitespace starts a new segment.
                seg = true;
                final boolean outsideContent = offset < firstNonNewlineOffset || offset > lastNonNewlineOffset;
                final boolean loneNewline = currIsNewline && !prevIsNewline && !nextIsNewline;
                skipCharacter = !this.tokenizeNewline
                        || !currIsNewline
                        || outsideContent
                        || (this.sentenceSplitOnTwoNewlines && loneNewline);
            }

            if (!skipCharacter) {
                final String xmlFlag;
                if (!isXMLCharacter) {
                    xmlFlag = "0";
                } else if (Character.isSpaceChar(cp) || Character.isISOControl(cp)) {
                    xmlFlag = "whitespace";
                } else if (offset == xmlStartOffset) {
                    xmlFlag = "beginning";
                } else {
                    xmlFlag = "1";
                }

                final CoreLabel wi = new CoreLabel();
                wi.set(CoreAnnotations.ChineseCharAnnotation.class, charString);
                wi.set(CoreAnnotations.ChineseSegAnnotation.class, seg ? "1" : "0");
                wi.set(SegmenterCoreAnnotations.XMLCharAnnotation.class, xmlFlag);
                wi.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, offset);
                wi.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, nextOffset);
                charTokens.add(wi);
                seg = false;
            }

            prevIsNewline = currIsNewline;
            offset = nextOffset;
        }

        annotation.set(SegmenterCoreAnnotations.CharactersAnnotation.class, charTokens);
    }

    /**
     * Walks forward through {@code sentChars} from {@code pos}, concatenating
     * each label's {@code ChineseCharAnnotation}, until the concatenation
     * equals {@code w}.
     *
     * <p>An empty {@code w} matches immediately and {@code pos} is returned
     * unchanged. If the characters never add up to {@code w}, the walk runs
     * off the end of the list and the list access fails.
     *
     * @param sentChars per-character labels
     * @param pos       index of the first character to consume
     * @param w         the word to match
     * @return the index just past the last consumed character
     */
    private static int advancePos(List<CoreLabel> sentChars, int pos, String w) {
        final StringBuilder consumed = new StringBuilder();
        int cursor = pos;
        while (!w.contentEquals(consumed)) {
            consumed.append(sentChars.get(cursor).get(CoreAnnotations.ChineseCharAnnotation.class));
            cursor++;
        }
        return cursor;
    }

    /**
     * Segments the text of {@code annotation} into words and stores the
     * resulting tokens under {@code CoreAnnotations.TokensAnnotation}.
     *
     * <p>The words returned by the segmenter are aligned against the
     * per-character labels read from
     * {@code SegmenterCoreAnnotations.CharactersAnnotation} to recover each
     * token's character offsets. Consecutive words that fall on XML-marked
     * characters are merged into a single token per tag.
     *
     * @param annotation map providing {@code TextAnnotation} and
     *                   {@code CharactersAnnotation}; receives {@code TokensAnnotation}
     */
    private void runSegmentation(CoreMap annotation) {
        List<String> words;
        String text = (String)annotation.get(CoreAnnotations.TextAnnotation.class);
        List sentChars = (List)annotation.get(SegmenterCoreAnnotations.CharactersAnnotation.class);
        if (this.VERBOSE) {
            log.info("sentChars (length " + sentChars.size() + ") is " + SentenceUtils.listToString(sentChars, StringUtils.EMPTY_STRING_ARRAY));
        }
        // The (still empty) list is attached first and filled in place below.
        ArrayList<CoreLabel> tokens = new ArrayList<CoreLabel>();
        annotation.set(CoreAnnotations.TokensAnnotation.class, tokens);
        if (!this.tokenizeNewline) {
            // Line breaks are not tokens: strip every CR/LF and segment the rest in one go.
            text = text.replaceAll("[\r\n]", "");
            words = this.segmenter.segmentString(text);
        } else {
            // Drop leading and trailing line breaks.
            text = text.replaceAll("^[\\r\\n]+", "");
            text = text.replaceAll("[\\r\\n]+$", "");
            if (this.sentenceSplitOnTwoNewlines) {
                // Remove isolated single line breaks. The identical substitution is applied twice:
                // regex matches cannot overlap, so in "a\nb\nc" the first pass consumes "b" as part
                // of the first match and leaves the second line break - presumably the second pass
                // exists to catch that case.
                text = text.replaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
                text = text.replaceAll("([^\\n])\\r?\\n([^\\r\\n])", "$1$2");
            }
            // Split before and after every line break so that each break becomes its own piece.
            String[] lines = text.split(String.format("((?<=%1$s)|(?=%1$s))", separator));
            words = new ArrayList<String>();
            for (String line : lines) {
                if (separatorPattern.matcher(line).matches()) {
                    // A line break is kept verbatim as a word; it is not sent to the segmenter.
                    words.add(line);
                    continue;
                }
                words.addAll(this.segmenter.segmentString(line));
            }
        }
        if (this.VERBOSE) {
            log.info(text + "\n--->\n" + words + " (length " + words.size() + ')');
        }
        // pos indexes into sentChars; xmlBuffer/xmlBegin accumulate the XML tag being assembled.
        int pos = 0;
        StringBuilder xmlBuffer = new StringBuilder();
        int xmlBegin = -1;
        for (String w : words) {
            CoreLabel fl = (CoreLabel)sentChars.get(pos);
            String xmlCharAnnotation = (String)fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class);
            if (this.VERBOSE) {
                log.info("Working on word " + w + ", sentChar " + fl.toShorterString(new String[0]) + " (sentChars index " + pos + ')');
            }
            // A non-XML character, or the start of a new tag, ends the tag collected so far:
            // emit it as one token ending at the previous character.
            if (("0".equals(xmlCharAnnotation) || "beginning".equals(xmlCharAnnotation)) && xmlBuffer.length() > 0) {
                String xmlTag = xmlBuffer.toString();
                CoreLabel fl1 = (CoreLabel)sentChars.get(pos - 1);
                int end = (Integer)fl1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
                tokens.add(this.makeXmlToken(xmlTag, true, xmlBegin, end));
                xmlBegin = -1;
                xmlBuffer = new StringBuilder();
            }
            if (!"0".equals(xmlCharAnnotation)) {
                // Inside an XML tag: whitespace characters are represented by a single space each.
                // NOTE(review): this loop reads sentChars.get(++pos) without a bounds check - confirm
                // a tag can never end on a "whitespace" character.
                while (((String)fl.get(SegmenterCoreAnnotations.XMLCharAnnotation.class)).equals("whitespace")) {
                    xmlBuffer.append(' ');
                    fl = (CoreLabel)sentChars.get(++pos);
                }
                xmlBuffer.append(w);
                pos = ChineseSegmenterAnnotator.advancePos(sentChars, pos, w);
                // Remember where the tag started only for its first piece.
                if (xmlBegin >= 0) continue;
                xmlBegin = (Integer)fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
                continue;
            }
            // Ordinary word: mark its first character as a segment start.
            fl.set(CoreAnnotations.ChineseSegAnnotation.class, "1");
            if (w.isEmpty()) {
                if (!this.VERBOSE) continue;
                log.warn("Encountered an empty word. Shouldn't happen?");
                continue;
            }
            int begin = (Integer)fl.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            // NOTE(review): advancePos only increments pos after a successful sentChars.get(pos),
            // so this bound check looks unreachable - confirm before relying on it.
            if ((pos = ChineseSegmenterAnnotator.advancePos(sentChars, pos, w)) - 1 >= sentChars.size()) {
                log.error("Error: on word " + w + " at position " + (pos - w.length()) + " trying to get at position " + (pos - 1));
                log.error("last element of sentChars is " + sentChars.get(sentChars.size() - 1));
                continue;
            }
            // The token ends where the word's last character ends.
            fl = (CoreLabel)sentChars.get(pos - 1);
            int end = (Integer)fl.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            tokens.add(this.makeXmlToken(w, false, begin, end));
        }
        // Flush a tag that runs to the end of the text.
        if (xmlBuffer.length() > 0) {
            String xmlTag = xmlBuffer.toString();
            CoreLabel fl1 = (CoreLabel)sentChars.get(pos - 1);
            int end = (Integer)fl1.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            tokens.add(this.makeXmlToken(xmlTag, true, xmlBegin, end));
        }
        if (this.VERBOSE) {
            for (CoreLabel token : tokens) {
                log.info(token.toShorterString(new String[0]));
            }
        }
    }

    /**
     * Builds a token for {@code tokenText} covering the given character span.
     *
     * <p>The original text is always recorded unchanged. The word/value is
     * {@code "*NL*"} when the text is exactly one line break; otherwise, when
     * both {@code doNormalization} and {@code normalizeSpace} are set, spaces
     * are replaced by U+00A0; otherwise it is the text as given.
     *
     * @param tokenText       surface text of the token
     * @param doNormalization whether space normalization may be applied (callers pass true for XML tags)
     * @param charOffsetBegin begin offset to record on the token
     * @param charOffsetEnd   end offset to record on the token
     * @return the newly created token
     */
    private CoreLabel makeXmlToken(String tokenText, boolean doNormalization, int charOffsetBegin, int charOffsetEnd) {
        final CoreLabel token = new CoreLabel();
        token.setOriginalText(tokenText);

        final String surface;
        if (separatorPattern.matcher(tokenText).matches()) {
            surface = "*NL*";
        } else if (doNormalization && this.normalizeSpace) {
            surface = tokenText.replace(' ', '\u00a0');
        } else {
            surface = tokenText;
        }

        token.setWord(surface);
        token.setValue(surface);
        token.set(CoreAnnotations.CharacterOffsetBeginAnnotation.class, charOffsetBegin);
        token.set(CoreAnnotations.CharacterOffsetEndAnnotation.class, charOffsetEnd);

        if (this.VERBOSE) {
            log.info("Adding token " + token.toShorterString(new String[0]));
        }
        return token;
    }

    /**
     * Reports the annotations that must already be present before this
     * annotator runs.
     *
     * @return an empty set; no prerequisites are declared
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requires() {
        return Collections.emptySet();
    }

    /**
     * Reports the annotation keys this annotator declares as provided.
     *
     * <p>NOTE(review): the code visible in this class sets the tokens list,
     * character offsets, original text, word and value; it does not obviously
     * set Before/After, TokenBegin/TokenEnd, Position or Index - confirm these
     * declarations against downstream expectations.
     *
     * @return a new mutable set of the declared annotation classes
     */
    @Override
    public Set<Class<? extends CoreAnnotation>> requirementsSatisfied() {
        List<Class<? extends CoreAnnotation>> provided = Arrays.asList(
                CoreAnnotations.TextAnnotation.class,
                CoreAnnotations.TokensAnnotation.class,
                CoreAnnotations.CharacterOffsetBeginAnnotation.class,
                CoreAnnotations.CharacterOffsetEndAnnotation.class,
                CoreAnnotations.BeforeAnnotation.class,
                CoreAnnotations.AfterAnnotation.class,
                CoreAnnotations.TokenBeginAnnotation.class,
                CoreAnnotations.TokenEndAnnotation.class,
                CoreAnnotations.PositionAnnotation.class,
                CoreAnnotations.IndexAnnotation.class,
                CoreAnnotations.OriginalTextAnnotation.class,
                CoreAnnotations.ValueAnnotation.class);
        return new HashSet<Class<? extends CoreAnnotation>>(provided);
    }
}

