Example #1
 /**
  * Saves the singleton predictor model to the given filename. If there is an error, a
  * RuntimeIOException is thrown.
  */
 public void saveToSerialized(LogisticClassifier<String, String> predictor, String filename) {
   try {
     log.info("Writing singleton predictor in serialized format to file " + filename + ' ');
     ObjectOutputStream out = IOUtils.writeStreamFromString(filename);
     out.writeObject(predictor);
     out.close();
     log.info("done.");
   } catch (IOException ioe) {
     throw new RuntimeIOException(ioe);
   }
 }
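For reference, the matching load step can be sketched with the same IOUtils helpers; this is an illustrative counterpart, not part of the class above, and it assumes IOUtils.readObjectFromFile for deserialization.

 /**
  * Illustrative counterpart (not from the original class): reload a model
  * previously written by saveToSerialized().
  */
 public LogisticClassifier<String, String> loadFromSerialized(String filename) {
   try {
     log.info("Reading singleton predictor from file " + filename);
     LogisticClassifier<String, String> predictor = IOUtils.readObjectFromFile(filename);
     return predictor;
   } catch (IOException | ClassNotFoundException e) {
     throw new RuntimeIOException(e);
   }
 }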
Example #2
 /** This hashCode uses only the docID, sentenceIndex, and index. See compareTo for more info. */
 @Override
 public int hashCode() {
   if (cachedHashCode != 0) {
     return cachedHashCode;
   }
   boolean sensible = false;
   int result = 0;
   if (get(CoreAnnotations.DocIDAnnotation.class) != null) {
     result = get(CoreAnnotations.DocIDAnnotation.class).hashCode();
     sensible = true;
   }
   if (containsKey(CoreAnnotations.SentenceIndexAnnotation.class)) {
     result = 29 * result + get(CoreAnnotations.SentenceIndexAnnotation.class).hashCode();
     sensible = true;
   }
   if (containsKey(CoreAnnotations.IndexAnnotation.class)) {
     result = 29 * result + get(CoreAnnotations.IndexAnnotation.class).hashCode();
     sensible = true;
   }
   if (!sensible) {
     log.info(
         "WARNING!!!  You have hashed an IndexedWord with no docID, sentIndex or wordIndex. You will almost certainly lose");
   }
   cachedHashCode = result;
   return result;
 }
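The method combines the hash codes of up to three annotations with the prime multiplier 29; equals (and, per the Javadoc, compareTo) should be driven by the same three fields. A minimal standalone sketch of the same pattern, using a hypothetical class rather than CoreNLP annotations:

 // Hypothetical class illustrating the same 29-multiplier combining pattern;
 // hashCode and equals are driven by the same three fields.
 final class DocPosition {
   final String docId;       // analogous to DocIDAnnotation
   final int sentenceIndex;  // analogous to SentenceIndexAnnotation
   final int wordIndex;      // analogous to IndexAnnotation

   DocPosition(String docId, int sentenceIndex, int wordIndex) {
     this.docId = docId;
     this.sentenceIndex = sentenceIndex;
     this.wordIndex = wordIndex;
   }

   @Override
   public int hashCode() {
     int result = docId.hashCode();
     result = 29 * result + Integer.hashCode(sentenceIndex);
     result = 29 * result + Integer.hashCode(wordIndex);
     return result;
   }

   @Override
   public boolean equals(Object o) {
     if (this == o) return true;
     if (!(o instanceof DocPosition)) return false;
     DocPosition other = (DocPosition) o;
     return docId.equals(other.docId)
         && sentenceIndex == other.sentenceIndex
         && wordIndex == other.wordIndex;
   }
 }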
Example #3
  public static void main(String[] args) throws Exception {
    Properties props = new Properties();
    if (args.length > 0) props = StringUtils.argsToProperties(args);
    if (!props.containsKey("dcoref.conll2011")) {
      log.info("-dcoref.conll2011 [input_CoNLL_corpus]: was not specified");
      return;
    }
    if (!props.containsKey("singleton.predictor.output")) {
      log.info("-singleton.predictor.output [output_model_file]: was not specified");
      return;
    }

    SingletonPredictor predictor = new SingletonPredictor();

    GeneralDataset<String, String> data = predictor.generateFeatureVectors(props);
    LogisticClassifier<String, String> classifier = predictor.train(data);
    predictor.saveToSerialized(classifier, props.getProperty("singleton.predictor.output"));
  }
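Both properties checked in main() must be supplied on the command line. A hedged invocation sketch; the paths are placeholders:

  // Hypothetical invocation (paths are placeholders): both properties below are
  // required by the checks in main() above.
  public static void runSingletonPredictorExample() throws Exception {
    String[] args = {
      "-dcoref.conll2011", "/path/to/conll2011_corpus",
      "-singleton.predictor.output", "/path/to/singleton_model.ser"
    };
    SingletonPredictor.main(args);
  }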
Example #4
 // should be able to pass in a comparator!
 protected static double precision(Set<?> s1, Set<?> s2) {
   double n = 0.0;
   double p = 0.0;
   for (Object o1 : s1) {
     if (s2.contains(o1)) {
       p += 1.0;
     }
     if (DEBUG) {
       if (s2.contains(o1)) {
         log.info("Eval Found: " + o1);
       } else {
         log.info("Eval Failed to find: " + o1);
       }
     }
     n += 1.0;
   }
   if (DEBUG) log.info("Matched " + p + " of " + n);
   return (n > 0.0 ? p / n : 0.0);
 }
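As a worked example, with s1 = {a, b, c} and s2 = {b, c, d}, two of the three elements of s1 are found in s2, so the method returns 2/3 ≈ 0.67. A standalone sketch that inlines the same loop (the original method is protected):

 // Worked example: precision({a, b, c}, {b, c, d}) = 2/3. Assumes java.util imports.
 public static void precisionExample() {
   Set<String> s1 = new HashSet<>(Arrays.asList("a", "b", "c"));
   Set<String> s2 = new HashSet<>(Arrays.asList("b", "c", "d"));
   double n = 0.0;
   double p = 0.0;
   for (String o1 : s1) {          // same loop as precision() above
     if (s2.contains(o1)) {
       p += 1.0;
     }
     n += 1.0;
   }
   System.out.println(n > 0.0 ? p / n : 0.0);  // prints 0.666...
 }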
Example #5
  public Annotation process(String sentence, String dateString, Annotator timeAnnotator) {
    log.info("Processing text \"" + sentence + "\" with dateString = " + dateString);
    Annotation anno = new Annotation(sentence);
    if (dateString != null && !dateString.equals("")) {
      anno.set(CoreAnnotations.DocDateAnnotation.class, dateString);
    }
    pipeline.annotate(anno);

    timeAnnotator.annotate(anno);
    return anno;
  }
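The pipeline field and the timeAnnotator argument are supplied by the surrounding class and its caller. A hedged usage sketch, assuming the pipeline is already configured and SUTime's TimeAnnotator is the annotator passed in (the helper name and sample text are illustrative):

  // Hedged usage sketch: names and sample values are illustrative only.
  public void processExample() {
    Annotator sutime = new TimeAnnotator("sutime", new Properties());
    Annotation anno = process("I went to school last Friday.", "2013-07-14", sutime);
    // SUTime attaches its results as TimexAnnotations (a List<CoreMap>) on the document.
    System.out.println(anno.get(TimeAnnotations.TimexAnnotations.class));
  }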
Example #6
 public float accuracy(Iterator<RVFDatum<L, F>> exampleIterator) {
   int correct = 0;
   int total = 0;
   while (exampleIterator.hasNext()) {
     RVFDatum<L, F> next = exampleIterator.next();
     L guess = classOf(next);
     if (guess.equals(next.label())) {
       correct++;
     }
     total++;
   }
   logger.info("correct " + correct + " out of " + total);
   return correct / (float) total;
 }
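The iterator yields RVFDatum objects, each pairing a real-valued feature Counter with a gold label. A hedged sketch of building such test data to hand to accuracy() (feature and label names are made up):

 // Hedged sketch: construct RVFDatum test examples of the kind the iterator above
 // is expected to yield; feature and label names are illustrative.
 public static Iterator<RVFDatum<String, String>> exampleTestData() {
   List<RVFDatum<String, String>> test = new ArrayList<>();
   ClassicCounter<String> feats = new ClassicCounter<>();
   feats.incrementCount("length", 4.0);           // real-valued feature
   feats.incrementCount("containsDigit", 1.0);
   test.add(new RVFDatum<>(feats, "POSITIVE"));   // gold label
   return test.iterator();
 }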
Example #7
  private static Set<String> readDict(String filename, boolean normalize) {
    Set<String> word = Generics.newHashSet();

    logger.info(
        "Loading " + (normalize ? "normalized" : "unnormalized") + " dictionary from " + filename);

    try {
      InputStream is = IOUtils.getInputStreamFromURLOrClasspathOrFileSystem(filename);
      BufferedReader wordDetectorReader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
      int i = 0;
      for (String wordDetectorLine; (wordDetectorLine = wordDetectorReader.readLine()) != null; ) {
        i++;
        // String[] fields = wordDetectorLine.split("	");
        // logger.debug("DEBUG: "+filename+" "+wordDetectorLine);
        int origLeng = wordDetectorLine.length();
        wordDetectorLine = wordDetectorLine.trim();
        int newLeng = wordDetectorLine.length();
        if (newLeng != origLeng) {
          EncodingPrintWriter.err.println(
              "Line "
                  + i
                  + " of "
                  + filename
                  + " has leading/trailing whitespace: |"
                  + wordDetectorLine
                  + "|",
              "UTF-8");
        }
        if (newLeng == 0) {
          EncodingPrintWriter.err.println("Line " + i + " of " + filename + " is empty", "UTF-8");
        } else {
          if (normalize) {
            wordDetectorLine =
                ChineseUtils.normalize(
                    wordDetectorLine,
                    ChineseUtils.ASCII,
                    ChineseUtils.ASCII,
                    ChineseUtils.NORMALIZE);
          }
          word.add(wordDetectorLine);
        }
      }
      is.close();
    } catch (IOException e) {
      throw new RuntimeIOException(e);
    }
    return word;
  }
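Since readDict is private, it is called from within the same class. A hedged usage sketch with a placeholder path:

  // Hedged usage sketch (same class; the path is a placeholder).
  private static void readDictExample() {
    Set<String> dict = readDict("/path/to/dict.txt", true);  // normalize entries
    logger.info("Loaded " + dict.size() + " dictionary entries");
  }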
Example #8
  public static void main(String[] args) {
    if (args.length < minArgs) {
      System.out.println(usage());
      System.exit(-1);
    }

    Properties options = StringUtils.argsToProperties(args, argDefs());
    Language language = PropertiesUtils.get(options, "l", Language.English, Language.class);
    TreebankLangParserParams tlpp = language.params;
    DiskTreebank tb = null;
    String encoding = options.getProperty("l", "UTF-8");
    boolean removeBracket = PropertiesUtils.getBool(options, "b", false);

    tlpp.setInputEncoding(encoding);
    tlpp.setOutputEncoding(encoding);
    tb = tlpp.diskTreebank();

    String[] files = options.getProperty("", "").split("\\s+");
    if (files.length != 0) {
      for (String filename : files) {
        tb.loadPath(filename);
      }
    } else {
      log.info(usage());
      System.exit(-1);
    }

    PrintWriter pwo = tlpp.pw();
    String startSymbol = tlpp.treebankLanguagePack().startSymbol();
    TreeFactory tf = new LabeledScoredTreeFactory();
    int nTrees = 0;
    for (Tree t : tb) {
      if (removeBracket) {
        if (t.value().equals(startSymbol)) {
          t = t.firstChild();
        }

      } else if (!t.value().equals(startSymbol)) { // Add a bracket if it isn't already there
        t = tf.newTreeNode(startSymbol, Collections.singletonList(t));
      }
      pwo.println(t.toString());
      nTrees++;
    }
    pwo.close();
    System.err.printf("Processed %d trees.%n", nTrees);
  }
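The tool reads its language from -l, a boolean -b that strips the top bracket instead of adding one, and treats the remaining arguments as treebank files. A hedged invocation sketch (it assumes argDefs() declares -b as a zero-argument flag; the path is a placeholder):

  // Hedged invocation sketch: -l selects the language, -b strips the top bracket.
  public static void runTreebankExample() {
    main(new String[] {"-l", "English", "-b", "/path/to/treebank/trees.mrg"});
  }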
Example #9
  /**
   * Creates a combined list of Entries using the provided mapping files.
   *
   * @param mappings List of mapping files
   * @return list of Entries
   */
  private static List<Entry> readEntries(
      String annotatorName,
      Set<String> noDefaultOverwriteLabels,
      boolean ignoreCase,
      boolean verbose,
      String... mappings) {
    // Unlike RegexNERClassifier, we don't bother sorting the entries.
    // We leave it to TokensRegex NER to sort out the priorities and matches
    //   (typically after all the matches have been made, since for some TokensRegex
    //       expressions we don't know how many tokens are matched until the matching is done)
    List<Entry> entries = new ArrayList<>();
    TrieMap<String, Entry> seenRegexes = new TrieMap<>();
    Arrays.sort(mappings);
    for (String mapping : mappings) {
      BufferedReader rd = null;
      try {
        rd = IOUtils.readerFromString(mapping);
        readEntries(
            annotatorName,
            entries,
            seenRegexes,
            mapping,
            rd,
            noDefaultOverwriteLabels,
            ignoreCase,
            verbose);
      } catch (IOException e) {
        throw new RuntimeIOException("Couldn't read TokensRegexNER from " + mapping, e);
      } finally {
        IOUtils.closeIgnoringExceptions(rd);
      }
    }

    if (mappings.length != 1) {
      logger.log(
          "TokensRegexNERAnnotator "
              + annotatorName
              + ": Read "
              + entries.size()
              + " unique entries from "
              + mappings.length
              + " files");
    }
    return entries;
  }
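A hedged usage sketch of the varargs overload above (same class; the mapping file paths are placeholders):

  // Hedged usage sketch: combine two TokensRegexNER mapping files into one entry list.
  private static void readEntriesExample() {
    List<Entry> combined = readEntries(
        "tokensregexner",               // annotator name used in log messages
        Collections.<String>emptySet(), // noDefaultOverwriteLabels
        false,                          // ignoreCase
        true,                           // verbose
        "/path/to/first.tab",           // placeholder mapping files
        "/path/to/second.tab");
    logger.log("Combined entries: " + combined.size());
  }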
Example #10
 public static <L, F> OneVsAllClassifier<L, F> train(
     ClassifierFactory<String, F, Classifier<String, F>> classifierFactory,
     GeneralDataset<L, F> dataset,
     Collection<L> trainLabels) {
   Index<L> labelIndex = dataset.labelIndex();
   Index<F> featureIndex = dataset.featureIndex();
   Map<L, Classifier<String, F>> classifiers = Generics.newHashMap();
   for (L label : trainLabels) {
     int i = labelIndex.indexOf(label);
     logger.info("Training " + label + " = " + i + ", posIndex = " + posIndex);
     // Create training data for training this classifier
     Map<L, String> posLabelMap = new ArrayMap<>();
     posLabelMap.put(label, POS_LABEL);
     GeneralDataset<String, F> binaryDataset =
         dataset.mapDataset(dataset, binaryIndex, posLabelMap, NEG_LABEL);
     Classifier<String, F> binaryClassifier = classifierFactory.trainClassifier(binaryDataset);
     classifiers.put(label, binaryClassifier);
   }
   OneVsAllClassifier<L, F> classifier =
       new OneVsAllClassifier<>(featureIndex, labelIndex, classifiers);
   return classifier;
 }
Example #11
  @Override
  public void initializeTraining(
      Options op, Lexicon lex, Index<String> wordIndex, Index<String> tagIndex, double totalTrees) {
    super.initializeTraining(op, lex, wordIndex, tagIndex, totalTrees);

    this.indexToStartUnkCounting = (totalTrees * op.trainOptions.fractionBeforeUnseenCounting);

    seenCounter = new ClassicCounter<>();
    unSeenCounter = new ClassicCounter<>();

    model = new EnglishUnknownWordModel(op, lex, wordIndex, tagIndex, unSeenCounter);

    // scan data
    if (DOCUMENT_UNKNOWNS) {
      log.info(
          "Collecting "
              + Lexicon.UNKNOWN_WORD
              + " from trees "
              + (indexToStartUnkCounting + 1)
              + " to "
              + totalTrees);
    }
  }
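indexToStartUnkCounting marks the point in training after which unknown-word statistics are collected. A worked example of that arithmetic with illustrative numbers:

  // Worked example of the threshold computed above (values are illustrative).
  public static void unkCountingThresholdExample() {
    double totalTrees = 1000.0;
    double fractionBeforeUnseenCounting = 0.5;
    double indexToStartUnkCounting = totalTrees * fractionBeforeUnseenCounting;  // 500.0
    // Unknown-word statistics are then collected from tree 501 through tree 1000.
    System.out.println("Start unknown-word counting after tree " + indexToStartUnkCounting);
  }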
Example #12
  public void evaluate(Tree guess, Tree gold, PrintWriter pw, double weight) {
    if (DEBUG) {
      log.info("Evaluating gold tree:");
      gold.pennPrint(System.err);
      log.info("and guess tree");
      guess.pennPrint(System.err);
    }
    Set<?> dep1 = makeObjects(guess);
    Set<?> dep2 = makeObjects(gold);
    final double curPrecision = precision(dep1, dep2);
    final double curRecall = precision(dep2, dep1);
    curF1 =
        (curPrecision > 0.0 && curRecall > 0.0
            ? 2.0 / (1.0 / curPrecision + 1.0 / curRecall)
            : 0.0);
    precision += curPrecision * weight;
    recall += curRecall * weight;
    f1 += curF1 * weight;
    num += weight;

    precision2 += dep1.size() * curPrecision * weight;
    pnum2 += dep1.size() * weight;

    recall2 += dep2.size() * curRecall * weight;
    rnum2 += dep2.size() * weight;

    if (curF1 > 0.9999) {
      exact += 1.0;
    }
    if (pw != null) {
      pw.print(" P: " + ((int) (curPrecision * 10000)) / 100.0);
      if (runningAverages) {
        pw.println(
            " (sent ave "
                + ((int) (precision * 10000 / num)) / 100.0
                + ") (evalb "
                + ((int) (precision2 * 10000 / pnum2)) / 100.0
                + ")");
      }
      pw.print(" R: " + ((int) (curRecall * 10000)) / 100.0);
      if (runningAverages) {
        pw.print(
            " (sent ave "
                + ((int) (recall * 10000 / num)) / 100.0
                + ") (evalb "
                + ((int) (recall2 * 10000 / rnum2)) / 100.0
                + ")");
      }
      pw.println();
      double cF1 = 2.0 / (rnum2 / recall2 + pnum2 / precision2);
      pw.print(str + " F1: " + ((int) (curF1 * 10000)) / 100.0);
      if (runningAverages) {
        pw.print(
            " (sent ave "
                + ((int) (10000 * f1 / num)) / 100.0
                + ", evalb "
                + ((int) (10000 * cF1)) / 100.0
                + ")   Exact: "
                + ((int) (10000 * exact / num)) / 100.0);
      }
      //      pw.println(" N: " + getNum());
      pw.println(" N: " + num);
    }
    /*
      Sentence s = guess.yield();
      for (Object obj : s) {
        if (curF1 < 0.7) {
          badwords.incrementCount(obj);
        } else {
          goodwords.incrementCount(obj);
        }
      }
    */
  }
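curF1 here is the harmonic mean of precision and recall. For example, with curPrecision = 0.8 and curRecall = 0.6, curF1 = 2 / (1/0.8 + 1/0.6) ≈ 0.686. A standalone sketch of that computation:

  // Worked example of the F1 computation used above (values are illustrative).
  public static void f1Example() {
    double curPrecision = 0.8;
    double curRecall = 0.6;
    double curF1 =
        (curPrecision > 0.0 && curRecall > 0.0
            ? 2.0 / (1.0 / curPrecision + 1.0 / curRecall)
            : 0.0);
    System.out.println(curF1);  // prints approximately 0.6857
  }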
Example #13
  /**
   * Configure all parameters for converting a list of tokens into sentences. The whole enchilada.
   *
   * @param boundaryTokenRegex Tokens that match this regex will end a sentence, but are retained at
   *     the end of the sentence. Substantive value must be supplied.
   * @param boundaryFollowersRegex Tokens matching this regex are allowed to be tacked onto the
   *     end of a sentence after a sentence boundary token, for example ")". Substantive value must
   *     be supplied.
   * @param boundariesToDiscard This is normally used for newline tokens if they are included in the
   *     tokenization. They may end the sentence (depending on the setting of
   *     newlineIsSentenceBreak), but at any rate are deleted from sentences in the output.
   *     Substantive value must be supplied.
   * @param xmlBreakElementsToDiscard These are elements like "p" or "sent", which will be wrapped
   *     into regex for approximate XML matching. They will be deleted in the output, and will
   *     always trigger a sentence boundary. May be null; means discard none.
   * @param regionElementRegex XML element name regex to delimit regions processed. Tokens outside
   *     one of these elements are discarded. May be null; means no filtering by region.
   * @param newlineIsSentenceBreak How to treat newlines. Must have substantive value.
   * @param sentenceBoundaryMultiTokenPattern A TokensRegex multi-token pattern for finding
   *     boundaries. May be null; means that there are no such patterns.
   * @param tokenRegexesToDiscard Regex for tokens to discard. May be null; means that no tokens are
   *     discarded in this way.
   * @param isOneSentence Whether to treat whole of input as one sentence regardless. Must have
   *     substantive value. Overrides anything else.
   * @param allowEmptySentences Whether to allow empty sentences to be output. Must have substantive
   *     value. Empty sentences are often suppressed, but that is unwanted in things like strict
   *     one-sentence-per-line mode.
   */
  public WordToSentenceProcessor(
      String boundaryTokenRegex,
      String boundaryFollowersRegex,
      Set<String> boundariesToDiscard,
      Set<String> xmlBreakElementsToDiscard,
      String regionElementRegex,
      NewlineIsSentenceBreak newlineIsSentenceBreak,
      SequencePattern<? super IN> sentenceBoundaryMultiTokenPattern,
      Set<String> tokenRegexesToDiscard,
      boolean isOneSentence,
      boolean allowEmptySentences) {
    sentenceBoundaryTokenPattern = Pattern.compile(boundaryTokenRegex);
    sentenceBoundaryFollowersPattern = Pattern.compile(boundaryFollowersRegex);
    sentenceBoundaryToDiscard = Collections.unmodifiableSet(boundariesToDiscard);
    if (xmlBreakElementsToDiscard == null || xmlBreakElementsToDiscard.isEmpty()) {
      this.xmlBreakElementsToDiscard = null;
    } else {
      this.xmlBreakElementsToDiscard = new ArrayList<>(xmlBreakElementsToDiscard.size());
      for (String s : xmlBreakElementsToDiscard) {
        String regex = "<\\s*(?:/\\s*)?(?:" + s + ")(?:\\s+[^>]+?|\\s*(?:/\\s*)?)>";
        // log.info("Regex is |" + regex + "|");
        // todo: Historically case insensitive, but maybe better and more proper to make case
        // sensitive?
        this.xmlBreakElementsToDiscard.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
      }
    }
    if (regionElementRegex != null) {
      sentenceRegionBeginPattern =
          Pattern.compile("<\\s*(?:" + regionElementRegex + ")(?:\\s+[^>]+?)?>");
      sentenceRegionEndPattern = Pattern.compile("<\\s*/\\s*(?:" + regionElementRegex + ")\\s*>");
    } else {
      sentenceRegionBeginPattern = null;
      sentenceRegionEndPattern = null;
    }
    this.newlineIsSentenceBreak = newlineIsSentenceBreak;
    this.sentenceBoundaryMultiTokenPattern = sentenceBoundaryMultiTokenPattern;
    if (tokenRegexesToDiscard != null) {
      this.tokenPatternsToDiscard = new ArrayList<>(tokenRegexesToDiscard.size());
      for (String s : tokenRegexesToDiscard) {
        this.tokenPatternsToDiscard.add(Pattern.compile(s));
      }
    } else {
      this.tokenPatternsToDiscard = null;
    }
    this.isOneSentence = isOneSentence;
    this.allowEmptySentences = allowEmptySentences;

    if (DEBUG) {
      log.info("WordToSentenceProcessor: boundaryTokens=" + boundaryTokenRegex);
      log.info("  boundaryFollowers=" + boundaryFollowersRegex);
      log.info("  boundariesToDiscard=" + boundariesToDiscard);
      log.info("  xmlBreakElementsToDiscard=" + xmlBreakElementsToDiscard);
      log.info("  regionBeginPattern=" + sentenceRegionBeginPattern);
      log.info("  regionEndPattern=" + sentenceRegionEndPattern);
      log.info("  newlineIsSentenceBreak=" + newlineIsSentenceBreak);
      log.info("  sentenceBoundaryMultiTokenPattern=" + sentenceBoundaryMultiTokenPattern);
      log.info("  tokenPatternsToDiscard=" + tokenPatternsToDiscard);
      log.info("  isOneSentence=" + isOneSentence);
      log.info("  allowEmptySentences=" + allowEmptySentences);
    }
  }
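A hedged construction sketch that matches the signature above; the regexes and settings are illustrative values, not the library's defaults:

  // Hedged construction sketch (argument values are illustrative, not defaults).
  public static WordToSentenceProcessor<CoreLabel> makeExampleProcessor() {
    return new WordToSentenceProcessor<>(
        "[.?!]+",                                // boundaryTokenRegex
        "[)\\]\"']",                             // boundaryFollowersRegex
        Collections.singleton("\n"),             // boundariesToDiscard (newline token)
        null,                                    // xmlBreakElementsToDiscard: discard none
        null,                                    // regionElementRegex: no region filtering
        WordToSentenceProcessor.NewlineIsSentenceBreak.TWO_CONSECUTIVE,
        null,                                    // sentenceBoundaryMultiTokenPattern: none
        null,                                    // tokenRegexesToDiscard: none
        false,                                   // isOneSentence
        false);                                  // allowEmptySentences
  }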
Example #14
  /**
   * Returns a List of Lists where each element is built from a run of Words in the input Document.
   * Specifically, reads through each word in the input document and breaks off a sentence after
   * finding a valid sentence boundary token or end of file. Note that for this to work, the words
   * in the input document must have been tokenized with a tokenizer that makes sentence boundary
   * tokens their own tokens (e.g., {@link PTBTokenizer}).
   *
   * @param words A list of already tokenized words (must implement HasWord or be a String).
   * @return A list of sentences.
   * @see #WordToSentenceProcessor(String, String, Set, Set, String, NewlineIsSentenceBreak,
   *     SequencePattern, Set, boolean, boolean)
   */
  public List<List<IN>> wordsToSentences(List<? extends IN> words) {
    IdentityHashMap<Object, Boolean> isSentenceBoundary =
        null; // is null unless used by sentenceBoundaryMultiTokenPattern

    if (sentenceBoundaryMultiTokenPattern != null) {
      // Do initial pass using tokensregex to identify multi token patterns that need to be matched
      // and add the last token to our table of sentence boundary tokens
      isSentenceBoundary = new IdentityHashMap<>();
      SequenceMatcher<? super IN> matcher = sentenceBoundaryMultiTokenPattern.getMatcher(words);
      while (matcher.find()) {
        List nodes = matcher.groupNodes();
        if (nodes != null && !nodes.isEmpty()) {
          isSentenceBoundary.put(nodes.get(nodes.size() - 1), true);
        }
      }
    }

    // Split tokens into sentences!!!
    List<List<IN>> sentences = Generics.newArrayList();
    List<IN> currentSentence = new ArrayList<>();
    List<IN> lastSentence = null;
    boolean insideRegion = false;
    boolean inWaitForForcedEnd = false;
    boolean lastTokenWasNewline = false;

    for (IN o : words) {
      String word = getString(o);
      boolean forcedEnd = isForcedEndToken(o);

      boolean inMultiTokenExpr = false;
      boolean discardToken = false;
      if (o instanceof CoreMap) {
        // Hacky stuff to ensure sentence breaks do not happen in certain cases
        CoreMap cm = (CoreMap) o;
        Boolean forcedUntilEndValue =
            cm.get(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class);
        if (!forcedEnd) {
          if (forcedUntilEndValue != null && forcedUntilEndValue) inWaitForForcedEnd = true;
          else {
            MultiTokenTag mt = cm.get(CoreAnnotations.MentionTokenAnnotation.class);
            if (mt != null && !mt.isEnd()) {
              // In the middle of a multi token mention, make sure sentence is not ended here
              inMultiTokenExpr = true;
            }
          }
        }
      }
      if (tokenPatternsToDiscard != null) {
        discardToken = matchesTokenPatternsToDiscard(word);
      }

      if (sentenceRegionBeginPattern != null && !insideRegion) {
        if (DEBUG) {
          log.info("Word is " + word + "; outside region; deleted");
        }
        if (sentenceRegionBeginPattern.matcher(word).matches()) {
          insideRegion = true;
          if (DEBUG) {
            log.info("  entering region");
          }
        }
        lastTokenWasNewline = false;
        continue;
      }

      if (lastSentence != null
          && currentSentence.isEmpty()
          && sentenceBoundaryFollowersPattern.matcher(word).matches()) {
        if (!discardToken) {
          lastSentence.add(o);
        }
        if (DEBUG) {
          log.info("Word is " + word + (discardToken ? "discarded" : "  added to last sentence"));
        }
        lastTokenWasNewline = false;
        continue;
      }

      boolean newSent = false;
      String debugText = (discardToken) ? "discarded" : "added to current";
      if (inWaitForForcedEnd && !forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is in wait for forced end; " + debugText);
        }
      } else if (inMultiTokenExpr && !forcedEnd) {
        if (!discardToken) currentSentence.add(o);
        if (DEBUG) {
          log.info("Word is " + word + "; is in multi token expr; " + debugText);
        }
      } else if (sentenceBoundaryToDiscard.contains(word)) {
        if (newlineIsSentenceBreak == NewlineIsSentenceBreak.ALWAYS) {
          newSent = true;
        } else if (newlineIsSentenceBreak == NewlineIsSentenceBreak.TWO_CONSECUTIVE) {
          if (lastTokenWasNewline) {
            newSent = true;
          }
        }
        lastTokenWasNewline = true;
        if (DEBUG) {
          log.info("Word is " + word + "  discarded sentence boundary");
        }
      } else {
        lastTokenWasNewline = false;
        Boolean isb;
        if (xmlBreakElementsToDiscard != null && matchesXmlBreakElementToDiscard(word)) {
          newSent = true;
          if (DEBUG) {
            log.info("Word is " + word + "; is XML break element; discarded");
          }
        } else if (sentenceRegionEndPattern != null
            && sentenceRegionEndPattern.matcher(word).matches()) {
          insideRegion = false;
          newSent = true;
          // Marked sentence boundaries
        } else if ((isSentenceBoundary != null)
            && ((isb = isSentenceBoundary.get(o)) != null)
            && isb) {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info(
                "Word is "
                    + word
                    + "; is sentence boundary (matched multi-token pattern); "
                    + debugText);
          }
          newSent = true;
        } else if (sentenceBoundaryTokenPattern.matcher(word).matches()) {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info("Word is " + word + "; is sentence boundary; " + debugText);
          }
          newSent = true;
        } else if (forcedEnd) {
          if (!discardToken) currentSentence.add(o);
          inWaitForForcedEnd = false;
          newSent = true;
          if (DEBUG) {
            log.info("Word is " + word + "; annotated to be the end of a sentence; " + debugText);
          }
        } else {
          if (!discardToken) currentSentence.add(o);
          if (DEBUG) {
            log.info("Word is " + word + "; " + debugText);
          }
        }
      }

      if (newSent && (!currentSentence.isEmpty() || allowEmptySentences)) {
        if (DEBUG) {
          log.info("  beginning new sentence");
        }
        sentences.add(currentSentence);
        // adds this sentence now that it's complete
        lastSentence = currentSentence;
        currentSentence = new ArrayList<>(); // clears the current sentence
      }
    }

    // add any words at the end, even if there isn't a sentence
    // terminator at the end of file
    if (!currentSentence.isEmpty()) {
      sentences.add(currentSentence); // adds last sentence
    }

    return sentences;
  }
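A hedged end-to-end sketch: tokenize with PTBTokenizer, which keeps sentence-final punctuation as separate tokens, then split into sentences (this assumes the usual edu.stanford.nlp imports and the no-argument WordToSentenceProcessor constructor with default boundaries):

  // Hedged end-to-end sketch; the sample text is illustrative.
  public static void splitExample() {
    String text = "Hello world. This is a second sentence.";
    PTBTokenizer<CoreLabel> tokenizer =
        new PTBTokenizer<>(new StringReader(text), new CoreLabelTokenFactory(), "");
    List<CoreLabel> tokens = tokenizer.tokenize();
    WordToSentenceProcessor<CoreLabel> wts = new WordToSentenceProcessor<>();
    List<List<CoreLabel>> sentences = wts.wordsToSentences(tokens);
    System.out.println(sentences.size());  // expected: 2
  }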
Example #15
  /**
   * Reads a list of Entries from a mapping file and update the given entries. Line numbers start
   * from 1.
   *
   * @return the updated list of Entries
   */
  private static List<Entry> readEntries(
      String annotatorName,
      List<Entry> entries,
      TrieMap<String, Entry> seenRegexes,
      String mappingFilename,
      BufferedReader mapping,
      Set<String> noDefaultOverwriteLabels,
      boolean ignoreCase,
      boolean verbose)
      throws IOException {
    int origEntriesSize = entries.size();
    int isTokensRegex = 0;
    int lineCount = 0;
    for (String line; (line = mapping.readLine()) != null; ) {
      lineCount++;
      String[] split = line.split("\t");
      if (split.length < 2 || split.length > 5) {
        throw new IllegalArgumentException(
            "Provided mapping file is in wrong format. This line is bad: " + line);
      }
      String regex = split[0].trim();
      String tokensRegex = null;
      String[] regexes = null;
      if (regex.startsWith("( ") && regex.endsWith(" )")) {
        // Tokens regex (remove start and end parenthesis)
        tokensRegex = regex.substring(1, regex.length() - 1).trim();
      } else {
        regexes = regex.split("\\s+");
      }
      String[] key = (regexes != null) ? regexes : new String[] {tokensRegex};
      if (ignoreCase) {
        String[] norm = new String[key.length];
        for (int i = 0; i < key.length; i++) {
          norm[i] = key[i].toLowerCase();
        }
        key = norm;
      }
      String type = split[1].trim();

      Set<String> overwritableTypes = Generics.newHashSet();
      double priority = 0.0;

      if (split.length >= 3) {
        overwritableTypes.addAll(Arrays.asList(split[2].trim().split("\\s*,\\s*")));
      }
      if (split.length >= 4) {
        try {
          priority = Double.parseDouble(split[3].trim());
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid priority in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }
      int annotateGroup = 0;
      // Get annotate group from input....
      if (split.length >= 5) {
        // Which group to take (allow for context)
        String context = split[4].trim();
        try {
          annotateGroup = Integer.parseInt(context);
        } catch (NumberFormatException e) {
          throw new IllegalArgumentException(
              "ERROR: Invalid group in line "
                  + lineCount
                  + " in regexner file "
                  + mappingFilename
                  + ": \""
                  + line
                  + "\"!",
              e);
        }
      }

      // Print some warning about the type
      int commaPos = type.indexOf(',');
      if (commaPos > 0) {
        // Strip the "," and just take first type
        String newType = type.substring(0, commaPos).trim();
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry has multiple types: "
                + line
                + ".  Taking type to be "
                + newType);
        type = newType;
      }

      Entry entry =
          new Entry(tokensRegex, regexes, type, overwritableTypes, priority, annotateGroup);

      if (seenRegexes.containsKey(key)) {
        Entry oldEntry = seenRegexes.get(key);
        if (priority > oldEntry.priority) {
          logger.warn(
              "TokensRegexNERAnnotator "
                  + annotatorName
                  + ": Replace duplicate entry (higher priority): old="
                  + oldEntry
                  + ", new="
                  + entry);
        } else {
          if (!oldEntry.type.equals(type)) {
            if (verbose) {
              logger.warn(
                  "TokensRegexNERAnnotator "
                      + annotatorName
                      + ": Ignoring duplicate entry: "
                      + split[0]
                      + ", old type = "
                      + oldEntry.type
                      + ", new type = "
                      + type);
            }
            // } else {
            //   if (verbose) {
            //     logger.warn("TokensRegexNERAnnotator " + annotatorName +
            //             ": Duplicate entry [ignored]: " + split[0] + ", old type = " +
            // oldEntry.type + ", new type = " + type);
            //   }
          }
          continue;
        }
      }

      // Print some warning if label belongs to noDefaultOverwriteLabels but there is no
      // overwritable types
      if (entry.overwritableTypes.isEmpty() && noDefaultOverwriteLabels.contains(entry.type)) {
        logger.warn(
            "TokensRegexNERAnnotator "
                + annotatorName
                + ": Entry doesn't have overwriteable types "
                + entry
                + ", but entry type is in noDefaultOverwriteLabels");
      }

      entries.add(entry);
      seenRegexes.put(key, entry);
      if (entry.tokensRegex != null) isTokensRegex++;
    }

    logger.log(
        "TokensRegexNERAnnotator "
            + annotatorName
            + ": Read "
            + (entries.size() - origEntriesSize)
            + " unique entries out of "
            + lineCount
            + " from "
            + mappingFilename
            + ", "
            + isTokensRegex
            + " TokensRegex patterns.");
    return entries;
  }
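The parser above expects tab-separated lines: a pattern, a type, and then optional overwritable types, a priority, and a match group; a pattern wrapped in "( ... )" is treated as a TokensRegex expression rather than whitespace-separated token regexes. Two illustrative mapping lines (fields separated by single tab characters; the labels and patterns are made up):

  Stanford University	ORGANIZATION	MISC	1.0
  ( /Univ(ersity)?/ /of/ /[A-Z][a-z]+/ )	SCHOOL	ORGANIZATION,MISC	2.0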