示例#1
0
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
  public static void main(String[] args) throws Exception {

    String inputFile = inputDirectory + "inputBulgarianSpecial-1.xml";
    String gateString = FileUtils.readFileToString(new File(inputFile), "UTF-8");

    log.error("Start Pipeline");
    Pipeline pl = new Pipeline();
    pl.initGate();
    Document doc = readDocument(gateString);

    /*
    log.error("Add Sentence Detection and Tokenization");
    GateOperations go = new GateOperations ();
    go.addExtraInfo(doc);
    writeFile(doc.toXml(),"inputEnglish-Proc-1.xml");
    */

    log.error("Add the Layout Annotation Sets");
    Layout layout = new Layout();
    layout.addLayout(doc);
    writeFile(doc.toXml(), "inputBulgarianSpecial-Final-1.xml");
  }
  /**
   * The main entry point. First we parse the command line options (see usage() method for details),
   * then we take all remaining command line parameters to be file names to process. Each file is
   * loaded, processed using the application and the results written to the output file
   * (inputFile.out.xml).
   */
  public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.

    ArrayList<String> files = getFilesFromDir(inputDir);
    gate.Corpus corpus = createCorpus(files);
    // Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {

      // load the document (using the specified encoding if one was given)
      File docFile = new File(files.get(i));
      System.out.print("Processing document " + docFile + " (" + i + ") ...");
      Document doc = Factory.newDocument(docFile.toURL(), encoding);

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      application.execute();

      // remove the document from the corpus again
      corpus.clear();

      String docXMLString = null;
      // if we want to just write out specific annotation types, we must
      // extract the annotations into a Set
      if (annotTypesToWrite != null) {
        // Create a temporary Set to hold the annotations we wish to write out
        Set annotationsToWrite = new HashSet();

        // we only extract annotations from the default (unnamed) AnnotationSet
        // in this example
        AnnotationSet defaultAnnots = doc.getAnnotations("Output");
        Iterator annotTypesIt = annotTypesToWrite.iterator();
        while (annotTypesIt.hasNext()) {
          // extract all the annotations of each requested type and add them to
          // the temporary set
          AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next());
          if (annotsOfThisType != null) {
            annotationsToWrite.addAll(annotsOfThisType);
          }
        }

        // create the XML string using these annotations
        docXMLString = doc.toXml(annotationsToWrite, true);
      }
      // otherwise, just write out the whole document as GateXML
      else {
        docXMLString = doc.toXml();
      }

      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);

      // output the XML to <inputFile>.out.xml
      System.out.println("Writing file " + docFile.getName());
      String outputFileName = docFile.getName() + ".out.xml";
      // File outputFile = new File(docFile.getParentFile(), outputFileName);
      File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

      // Write output files using the same encoding as the original
      FileOutputStream fos = new FileOutputStream(outputFile);
      BufferedOutputStream bos = new BufferedOutputStream(fos);
      OutputStreamWriter out;
      if (encoding == null) {
        out = new OutputStreamWriter(bos);
      } else {
        out = new OutputStreamWriter(bos, encoding);
      }

      out.write(docXMLString);

      out.close();
      System.out.println("done");
    } // for each file

    System.out.println("All done");
  } // void main(String[] args)
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (int i = 0; i < args.length; i++) {
      URL u = new URL(args[i]);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      params.put("preserveOriginalContent", new Boolean(true));
      params.put("collectRepositioningInfo", new Boolean(true));
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // for each document, get an XML document with the
    // person and location names added
    Iterator iter = corpus.iterator();
    int count = 0;
    String startTagPart_1 = "<span GateID=\"";
    String startTagPart_2 = "\" title=\"";
    String startTagPart_3 = "\" style=\"background:Red;\">";
    String endTag = "</span>";

    while (iter.hasNext()) {
      Document doc = (Document) iter.next();
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set annotTypesRequired = new HashSet();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null && info != null) {
        Out.prln("OrigContent and reposInfo existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionStart = info.getOriginalPos(insertPositionStart);
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } // if - should generate
      else if (originalContent != null) {
        Out.prln("OrigContent existing. Generate file...");

        Iterator it = peopleAndPlaces.iterator();
        Annotation currAnnot;
        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();

        while (it.hasNext()) {
          currAnnot = (Annotation) it.next();
          sortedAnnotations.addSortedExclusive(currAnnot);
        } // while

        StringBuffer editableContent = new StringBuffer(originalContent);
        long insertPositionEnd;
        long insertPositionStart;
        // insert anotation tags backward
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          currAnnot = (Annotation) sortedAnnotations.get(i);
          insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        FileWriter writer = new FileWriter(file);
        writer.write(editableContent.toString());
        writer.close();
      } else {
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = new String("StANNIE_toXML_" + count + ".HTML");
      FileWriter writer = new FileWriter(fileName);
      writer.write(xmlDocument);
      writer.close();
    } // for each doc
  } // main