public static void main(String[] args) throws IOException {

    String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";

    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example,
       this demo file shows two ways to process the output, for teaching
       purposes.  For the file, it shows both how to run NER on a String
       and how to run it on a whole file.  For the hard-coded String,
       it shows how to run it on a single sentence, and how to do this
       and produce an inline XML output format.
    */
    if (args.length > 1) {
      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(word.word() + '/' + word.get(AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

    } else {
      String s1 = "Good afternoon Rajat Raina, how are you today?";
      String s2 = "I go to school at Stanford University, which is located in California.";
      System.out.println(classifier.classifyToString(s1));
      System.out.println(classifier.classifyWithInlineXML(s2));
      System.out.println(classifier.classifyToString(s2, "xml", true));
      int i = 0;
      for (List<CoreLabel> lcl : classifier.classify(s2)) {
        for (CoreLabel cl : lcl) {
          System.out.println(i++ + ":");
          System.out.println(cl);
        }
      }
    }
  }
  // Method to process the named entities
  public void processNamedEntity() {
    String serializedClassifier =
        "classifiers/english.all.3class.distsim.crf.ser.gz"; // 3 class (PERSON, LOCATION and
                                                             // ORGANISATION) classifier
    // String serializedClassifier = "classifiers/english.conll.4class.distsim.crf.ser.gz";
    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifierNoExceptions(serializedClassifier);
    FileHandler fw = new FileHandler();
    org.apache.log4j.PropertyConfigurator.configure(fw.filenameLog4JConfig);
    Path path = Paths.get(fw.filename); // path for input file

    try (BufferedReader reader = Files.newBufferedReader(path, Charset.defaultCharset())) {
      fw.setFile(fw.fileResultsWithNER); // delete contents to file before starting the process
      String line = null;
      while ((line = reader.readLine()) != null) {
        String classifiedData = classifier.classifyWithInlineXML(line);
        fw.writeToFile("", classifiedData, fw.fileResultsWithNER, true);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
Example #3
0
  public static Set<String> analyzeThisFile(String path) {

    String serializedClassifier = "classifiers/english.conll.4class.distsim.crf.ser.gz";

    Set<String> annotationsNE = new HashSet<String>();
    try {
      BufferedReader br = new BufferedReader(new FileReader(path));
      AbstractSequenceClassifier<CoreLabel> classifier =
          CRFClassifier.getClassifier(serializedClassifier);
      String line = "";
      while ((line = br.readLine()) != null) {
        List<Triple<String, Integer, Integer>> triples =
            classifier.classifyToCharacterOffsets(line);
        for (Triple<String, Integer, Integer> trip : triples) {
          annotationsNE.add(line.substring(trip.second(), trip.third()));
        }
      }

      System.out.println("Named Entity TROVATE");
      for (String ne : annotationsNE) System.out.println(ne);

    } catch (FileNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (ClassCastException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (ClassNotFoundException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    } catch (IOException e) {
      // TODO Auto-generated catch block
      e.printStackTrace();
    }

    return annotationsNE;
  }
Example #4
0
  public static void main(String[] args) throws Exception {

    // String serializedClassifier = "classifiers/english.all.3class.distsim.crf.ser.gz";
    String serializedClassifier = "classifiers/english.muc.7class.distsim.crf.ser.gz";
    if (args.length > 0) {
      serializedClassifier = args[0];
    }

    AbstractSequenceClassifier<CoreLabel> classifier =
        CRFClassifier.getClassifier(serializedClassifier);

    /* For either a file to annotate or for the hardcoded text example, this
       demo file shows several ways to process the input, for teaching purposes.
    */

    if (args.length > 1) {

      /* For the file, it shows (1) how to run NER on a String, (2) how
         to get the entities in the String with character offsets, and
         (3) how to run NER on a whole file (without loading it into a String).
      */

      String fileContents = IOUtils.slurpFile(args[1]);
      List<List<CoreLabel>> out = classifier.classify(fileContents);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");
      out = classifier.classifyFile(args[1]);
      for (List<CoreLabel> sentence : out) {
        for (CoreLabel word : sentence) {
          System.out.print(
              word.word() + '/' + word.get(CoreAnnotations.AnswerAnnotation.class) + ' ');
        }
        System.out.println();
      }

      System.out.println("---");

      List<Triple<String, Integer, Integer>> list =
          classifier.classifyToCharacterOffsets(fileContents);
      for (Triple<String, Integer, Integer> item : list) {
        // print entity/or non-entity - their nearby tokens
        System.out.println(
            item.first() + ": " + fileContents.substring(item.second(), item.third()));
      }
      System.out.println("---");
      System.out.println("Ten best entity labelings");
      DocumentReaderAndWriter<CoreLabel> readerAndWriter =
          classifier.makePlainTextReaderAndWriter();
      classifier.classifyAndWriteAnswersKBest(args[1], 10, readerAndWriter);

      System.out.println("---");
      System.out.println("Per-token marginalized probabilities");
      classifier.printProbs(args[1], readerAndWriter);

      // -- This code prints out the first order (token pair) clique probabilities.
      // -- But that output is a bit overwhelming, so we leave it commented out by default.
      // System.out.println("---");
      // System.out.println("First Order Clique Probabilities");
      // ((CRFClassifier) classifier).printFirstOrderProbs(args[1], readerAndWriter);

    } else {

      /* For the hard-coded String, it shows how to run it on a single
         sentence, and how to do this and produce several formats, including
         slash tags and an inline XML output format. It also shows the full
         contents of the {@code CoreLabel}s that are constructed by the
         classifier. And it shows getting out the probabilities of different
         assignments and an n-best list of classifications with probabilities.
      */

      String[] example = {
        "Good afternoon Rajat Raina, how are you today? I go to Washington DC on September 19. And Tomorrow.",
        "I go to school at Stanford University, which is located in California."
      };
      for (String str : example) {
        System.out.println(classifier.classifyToString(str));
      }
      System.out.println("---");

      // ***sentence-by-sentence
      for (String str : example) {
        // This one puts in spaces and newlines between tokens, so just print not println.
        System.out.print(classifier.classifyToString(str, "slashTags", false));
      }
      System.out.println("---");

      // ***print: entities + Classes + remaining text in the text
      for (String str : example) {
        // This one is best for dealing with the output as a TSV (tab-separated column) file.
        // The first column gives entities, the second their classes, and the third the remaining
        // text in a document
        System.out.print(classifier.classifyToString(str, "tabbedEntities", false));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyWithInlineXML(str));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.println(classifier.classifyToString(str, "xml", true));
      }
      System.out.println("---");

      for (String str : example) {
        System.out.print(classifier.classifyToString(str, "tsv", false));
      }
      System.out.println("---");

      // This gets out entities with character offsets
      System.out.print("character offsets");
      int j = 0;
      for (String str : example) {
        j++;
        List<Triple<String, Integer, Integer>> triples = classifier.classifyToCharacterOffsets(str);
        for (Triple<String, Integer, Integer> trip : triples) {
          System.out.printf(
              "%s over character offsets [%d, %d) in sentence %d.%n",
              trip.first(), trip.second(), trip.third, j);
        }
      }
      System.out.println("---");

      // This prints out all the details of what is stored for each token
      int i = 0;
      for (String str : example) {
        for (List<CoreLabel> lcl : classifier.classify(str)) {
          for (CoreLabel cl : lcl) {
            System.out.print(i++ + ": ");
            System.out.println(cl.toShorterString());
          }
        }
      }

      System.out.println("---");
    }
  }