Beispiel #1
0
 /**
  * Collects the text of every span the classifier reports for the given sentence.
  *
  * @param sentence the text handed to the classifier
  * @return the substrings of {@code sentence} covered by each reported span, in the order the
  *     classifier returned them; repeated spans are kept
  */
 public ArrayList<String> getNamedEntity(String sentence) {
   List<Triple<String, Integer, Integer>> spans = classifier.classifyToCharacterOffsets(sentence);
   ArrayList<String> found = new ArrayList<>();
   for (Triple<String, Integer, Integer> span : spans) {
     // second/third are used as the begin (inclusive) and end (exclusive) of the substring.
     int begin = span.second();
     int end = span.third();
     String text = sentence.substring(begin, end);
     found.add(text);
   }
   return found;
 }
 /**
  * Maps the text of each span returned by {@code findNerSubstrings} to its {@link NamedEntity}.
  *
  * <p>The map key is the substring of {@code sentence} delimited by the span's second and third
  * components; the value is looked up from the span's first component. If the same text occurs in
  * several spans, the last one wins because later {@code put} calls overwrite earlier ones.
  *
  * @param sentence the text to analyse
  * @return a new mutable map from span text to named entity
  */
 public Map<String, NamedEntity> findNamedEntities(String sentence) {
   final Map<String, NamedEntity> result = new HashMap<>();
   for (final Triple<String, Integer, Integer> span : findNerSubstrings(sentence)) {
     final String text = sentence.substring(span.second(), span.third());
     final NamedEntity entity = NamedEntity.getNamedEntity(span.first());
     result.put(text, entity);
   }
   return result;
 }
 /**
  * Returns a printable summary of the precision for one label.
  *
  * <p>The result has the form {@code value  (a/b)}, where {@code value} is the first component of
  * {@code getPrecisionInfo(label)} formatted for the default locale, {@code a} is the second
  * component and {@code b} is the sum of the second and third components.
  *
  * @param numDigits maximum number of fraction digits shown for the value
  * @param label the label whose precision is described
  * @return the formatted description
  */
 public String getPrecisionDescription(int numDigits, L label) {
   NumberFormat formatter = NumberFormat.getNumberInstance();
   formatter.setMaximumFractionDigits(numDigits);
   Triple<Double, Integer, Integer> info = getPrecisionInfo(label);
   StringBuilder text = new StringBuilder();
   text.append(formatter.format(info.first()));
   text.append("  (").append(info.second()).append("/");
   text.append(info.second() + info.third());
   text.append(")");
   return text.toString();
 }
 /**
  * Returns a printable summary of the recall for one label.
  *
  * <p>The result has the form {@code value  (a/b)}, where {@code value} is the first component of
  * {@code getRecallInfo(label)} formatted for the default locale, {@code a} is the second
  * component and {@code b} is the sum of the second and third components.
  *
  * @param numDigits maximum number of fraction digits shown for the value
  * @param label the label whose recall is described
  * @return the formatted description
  */
 public String getRecallDescription(int numDigits, L label) {
   NumberFormat formatter = NumberFormat.getNumberInstance();
   formatter.setMaximumFractionDigits(numDigits);
   Triple<Double, Integer, Integer> info = getRecallInfo(label);
   StringBuilder text = new StringBuilder();
   text.append(formatter.format(info.first()));
   text.append("  (").append(info.second()).append("/");
   text.append(info.second() + info.third());
   text.append(")");
   return text.toString();
 }
 /**
  * Returns a String summarizing overall accuracy that will print nicely.
  *
  * <p>The result has the form {@code value  (a/b)}, where {@code value} is the first component of
  * {@code getAccuracyInfo()} formatted for the default locale, {@code a} is the second component
  * and {@code b} is the sum of the second and third components.
  *
  * @param numDigits maximum number of fraction digits shown for the value
  * @return the formatted description
  */
 public String getAccuracyDescription(int numDigits) {
   NumberFormat formatter = NumberFormat.getNumberInstance();
   formatter.setMaximumFractionDigits(numDigits);
   Triple<Double, Integer, Integer> info = getAccuracyInfo();
   StringBuilder text = new StringBuilder();
   text.append(formatter.format(info.first()));
   text.append("  (").append(info.second()).append("/");
   text.append(info.second() + info.third());
   text.append(")");
   return text.toString();
 }
Beispiel #6
0
  /**
   * Runs the serialized CRF classifier over a text file, one line at a time, and collects the
   * distinct substrings it marks as entities.
   *
   * <p>The collected entities are also printed to standard output under a header line. Any failure
   * to open or read the file, or to load the classifier, is reported with a stack trace; the method
   * then returns whatever was collected up to that point (possibly nothing) instead of throwing.
   *
   * @param path path of the text file to analyse
   * @return the set of distinct entity strings found; never {@code null}
   */
  public static Set<String> analyzeThisFile(String path) {

    String serializedClassifier = "classifiers/english.conll.4class.distsim.crf.ser.gz";

    Set<String> annotationsNE = new HashSet<String>();
    // try-with-resources guarantees the reader is closed on every exit path; the previous version
    // never closed it and leaked the file handle.
    try (BufferedReader br = new BufferedReader(new FileReader(path))) {
      AbstractSequenceClassifier<CoreLabel> classifier =
          CRFClassifier.getClassifier(serializedClassifier);
      String line;
      while ((line = br.readLine()) != null) {
        List<Triple<String, Integer, Integer>> triples =
            classifier.classifyToCharacterOffsets(line);
        for (Triple<String, Integer, Integer> trip : triples) {
          // Offsets are relative to the current line, so slice the line itself.
          annotationsNE.add(line.substring(trip.second(), trip.third()));
        }
      }

      System.out.println("Named Entity TROVATE");
      for (String ne : annotationsNE) {
        System.out.println(ne);
      }

    } catch (IOException | ClassCastException | ClassNotFoundException e) {
      // Best effort: report the problem and fall through to return the partial result.
      // FileNotFoundException is an IOException and is covered here as well.
      e.printStackTrace();
    }

    return annotationsNE;
  }
  /**
   * Builds a multi-line evaluation report: a summary of processed tokens and phrases, one line of
   * overall accuracy/precision/recall/FB1, and then one line per label.
   *
   * <p>Percentages are formatted with {@link Locale#US} so the decimal separator is always a dot.
   *
   * @param orderedLabels labels to report, in output order
   * @param ignoreNegLabel when {@code true}, the label equal to {@code negLabel} gets no line
   * @return the complete report text
   */
  private String getConllEvalString(List<L> orderedLabels, boolean ignoreNegLabel) {
    int correctPhrases = getCorrect() - getCorrect(negLabel);
    Triple<Double, Integer, Integer> accuracy = getAccuracyInfo();
    int tokenCount = accuracy.second() + accuracy.third();

    String processedPart =
        "processed " + tokenCount + " tokens with " + getRelevant() + " phrases; ";
    String foundPart = "found: " + getRetrieved() + " phrases; correct: " + correctPhrases + "\n";
    StringBuilder report = new StringBuilder();
    report.append(processedPart).append(foundPart);

    // The formatter writes straight into the same builder, after the header lines above.
    Formatter out = new Formatter(report, Locale.US);
    out.format("accuracy: %6.2f%%; ", accuracy.first() * 100)
        .format("precision: %6.2f%%; ", getPrecision() * 100)
        .format("recall: %6.2f%%; ", getRecall() * 100)
        .format("FB1: %6.2f\n", getFMeasure() * 100);

    for (L label : orderedLabels) {
      boolean skipped = ignoreNegLabel && label.equals(negLabel);
      if (!skipped) {
        out.format("%17s: ", label)
            .format("precision: %6.2f%%; ", getPrecision(label) * 100)
            .format("recall: %6.2f%%; ", getRecall(label) * 100)
            .format("FB1: %6.2f  %d\n", getFMeasure(label) * 100, getRetrieved(label));
      }
    }
    return report.toString();
  }
Beispiel #8
0
  /**
   * Demonstrates several ways of running a serialized CRF named-entity classifier.
   *
   * <p>Usage: {@code args[0]} (optional) is the path of the serialized classifier; {@code args[1]}
   * (optional) is a text file to annotate. Without a file argument, two hard-coded example strings
   * are classified and printed in a number of output formats.
   *
   * @param args optional classifier path followed by an optional input file path
   * @throws Exception if the classifier cannot be loaded or the input cannot be processed
   */
  public static void main(String[] args) throws Exception {

    // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
    String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";
    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");

      List<Triple<String, Integer, Integer>> list =
          classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        // Print each span's label followed by the text it covers in the file.
        System.out.println(
            item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter =
          classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {
        "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
        "I go to school at Stanford University, which is located in California."
      };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      // Sentence-by-sentence output in slash-tag format.
      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      // Entities, their classes, and the remaining text as tab-separated columns.
      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining
        // text in a document
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      // println (not print) so the header does not run into the first offsets line.
      System.out.println("character offsets");
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String, Integer, Integer> trip : triples) {
          System.out.printf(
              "%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third(), j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");
    }
  }
 /**
  * Trains the pairwise classifier stored in {@code this.classifier} from gold-annotated documents.
  *
  * <p>For every mention, earlier mentions of the same document are visited from nearest to
  * farthest. Each pair yields one datum labelled {@code true} when both mentions map to the same
  * gold entity object and {@code false} otherwise; the scan for a mention stops at the first
  * {@code true} pair. The collected dataset is then passed to a {@code LinearClassifierFactory}.
  *
  * @param trainingData documents paired with their gold entity clusters
  * @throws IllegalArgumentException if a mention of a document has no gold entity
  */
 public void train(Collection<Pair<Document, List<Entity>>> trainingData) {
   startTrack("Training");
   // --Variables
   RVFDataset<Boolean, Feature> dataset = new RVFDataset<Boolean, Feature>();
   LinearClassifierFactory<Boolean, Feature> fact =
       new LinearClassifierFactory<Boolean, Feature>();
   // --Feature Extraction
   startTrack("Feature Extraction");
   for (Pair<Document, List<Entity>> datum : trainingData) {
     // (document variables)
     Document doc = datum.getFirst();
     List<Entity> goldClusters = datum.getSecond();
     List<Mention> mentions = doc.getMentions();
     Map<Mention, Entity> goldEntities = Entity.mentionToEntityMap(goldClusters);
     startTrack("Document " + doc.id);
     // (for each mention...)
     for (int i = 0; i < mentions.size(); i++) {
       // (get the mention and its cluster)
       Mention onPrix = mentions.get(i);
       Entity source = goldEntities.get(onPrix);
       if (source == null) {
         throw new IllegalArgumentException("Mention has no gold entity: " + onPrix);
       }
       // (for each previous mention...)
       // oldSize is only needed by the disabled log line after the inner loop.
       int oldSize = dataset.size();
       for (int j = i - 1; j >= 0; j--) {
         // (get previous mention and its cluster)
         Mention cand = mentions.get(j);
         Entity target = goldEntities.get(cand);
         if (target == null) {
           throw new IllegalArgumentException("Mention has no gold entity: " + cand);
         }
         // (extract features)
         Counter<Feature> feats =
             extractor.extractFeatures(Pair.make(onPrix, cand.markCoreferent(target)));
         // (add datum; the label is true when both mentions share the same gold entity object)
         dataset.add(new RVFDatum<Boolean, Feature>(feats, target == source));
         // (stop at the nearest previous mention that belongs to the same gold entity)
         if (target == source) {
           break;
         }
       }
       // logf("Mention %s (%d datums)", onPrix.toString(), dataset.size() - oldSize);
     }
     endTrack("Document " + doc.id);
   }
   endTrack("Feature Extraction");
   // --Train Classifier
   startTrack("Minimizer");
   this.classifier = fact.trainClassifier(dataset);
   endTrack("Minimizer");
   // --Dump Weights
   startTrack("Features");
   // (get labels to print)
   Set<Boolean> labels = new HashSet<Boolean>();
   labels.add(true);
   // (print features; the actual log call is currently disabled)
   for (Triple<Feature, Boolean, Double> featureInfo :
       this.classifier.getTopFeatures(labels, 0.0, true, 100, true)) {
     Feature feature = featureInfo.first();
     Boolean label = featureInfo.second();
     Double magnitude = featureInfo.third();
     // log(FORCE,new DecimalFormat("0.000").format(magnitude) + " [" + label + "] " + feature);
   }
   // Close the "Features" track with the same endTrack helper used for every other track.
   endTrack("Features");
   endTrack("Training");
 }