예제 #1
0
  /**
   * Runs the GATE application over one Behemoth document and returns the processed GATE document
   * serialised as XML.
   *
   * <p>The GATE document is placed in the shared corpus only for the duration of the run: the
   * corpus is emptied and the GATE document released afterwards, whether or not processing
   * succeeded.
   *
   * @param inputDoc the document to process
   * @param reporter progress reporter; may be {@code null}, in which case nothing is reported
   * @return the XML of the processed GATE document, or {@code null} if an exception was raised
   *     (the exception is logged and counted, not propagated)
   */
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    final boolean reporting = reporter != null;
    if (reporting) {
      reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    }

    String xml = null;
    gate.Document gatedocument = null;
    try {
      // Build a GATE document from the input and make it the only member of the corpus.
      gatedocument = generateGATEDoc(inputDoc);
      corpus.add(gatedocument);

      // Point the application at the corpus and run it.
      this.GATEapplication.setCorpus(corpus);
      this.GATEapplication.execute();

      if (reporting) {
        reporter.incrCounter("GATE", "Document", 1);
      }
      xml = gatedocument.toXml();
    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporting) {
        reporter.incrCounter("GATE", "Exceptions", 1);
      }
    } finally {
      // Detach the document from the corpus, then free it.
      corpus.clear();
      if (gatedocument != null) {
        Factory.deleteResource(gatedocument);
      }
    }
    return xml;
  }
 /**
  * Runs the controller over a single in-memory document built from {@code str} and collects the
  * {@code "sym"} feature of every annotation in the {@code "StockSymbols"} annotation set.
  *
  * <p>The temporary document and corpus are released before returning, also when an exception is
  * thrown.
  *
  * @param str the text to analyse
  * @return the collected {@code "sym"} feature values
  * @throws Exception whatever the GATE factory or the controller throws
  */
 @SuppressWarnings("unchecked")
 public Set<String> processDoc(String str) throws Exception {
   Set<String> symbols = new HashSet<String>();
   Corpus sampleCorpus = null;
   Document sampleDoc = null;
   try {
     sampleCorpus = Factory.newCorpus("sample");
     sampleDoc = Factory.newDocument(str);
     sampleCorpus.add(sampleDoc);

     controller.setCorpus(sampleCorpus);
     controller.execute();

     AnnotationSet stockSymbols = sampleDoc.getAnnotations("StockSymbols");
     for (Annotation annotation : stockSymbols) {
       symbols.add((String) annotation.getFeatures().get("sym"));
     }
   } finally {
     // Release the document first, then the corpus that held it.
     if (sampleDoc != null) {
       Factory.deleteResource(sampleDoc);
     }
     if (sampleCorpus != null) {
       Factory.deleteResource(sampleCorpus);
     }
   }
   return symbols;
 }
예제 #3
0
  /**
   * Runs the saved Persian sentiment GATE application over {@code text} and returns the detected
   * sentiment as JSON.
   *
   * <p>The application stores its result in the document feature {@code "Doc_sentiment"}, which is
   * expected to be a map containing a {@code "sentiment"} entry. That entry is copied into the
   * returned object under the key {@code "sentiment"}.
   *
   * <p>All GATE resources created here (document, corpus and the loaded application) are released
   * before the method returns, also when processing fails.
   *
   * @param text the text to analyse
   * @return a JSON object with a single {@code "sentiment"} entry
   * @throws IllegalStateException if the application did not produce a {@code "Doc_sentiment"} map
   * @throws Exception if GATE initialisation, application loading or execution fails
   */
  public JSONObject persian_sentiment(String text) throws Exception {

    oncreate();

    // NOTE(review): machine-specific absolute path; consider making this configurable.
    File gappFile = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");

    // GATE must be initialised before any other GATE API is used, but only once per JVM.
    if (!Gate.isInitialised()) {
      Gate.init();
    }

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    Corpus corpus = null;
    Document doc = null;
    try {
      // The string parameter to newCorpus() is simply the GATE-internal name of the corpus.
      corpus = Factory.newCorpus("BatchProcessApp Corpus");
      application.setCorpus(corpus);

      doc = Factory.newDocument(text);
      corpus.add(doc);

      // run the application
      application.execute();

      // Read the result while the document is still alive.
      String featureName = "Doc_sentiment";
      Object rawResult = doc.getFeatures().get(featureName);
      if (!(rawResult instanceof java.util.Map)) {
        throw new IllegalStateException(
            "GATE application did not set a map-valued document feature '" + featureName + "'");
      }
      String sentiment = (String) ((java.util.Map<?, ?>) rawResult).get("sentiment");

      // create Json for response to user
      JSONObject response = new JSONObject();
      response.put("sentiment", sentiment);
      return response;
    } finally {
      // Release everything that was created for this call so repeated calls do not leak.
      if (corpus != null) {
        corpus.clear();
      }
      if (doc != null) {
        Factory.deleteResource(doc);
      }
      if (corpus != null) {
        Factory.deleteResource(corpus);
      }
      Factory.deleteResource(application);
    }
  }
예제 #4
0
 /**
  * Invokes the closure once for every document of the corpus, in index order, and appends each
  * return value to {@code coll}.
  *
  * <p>A document that was not loaded before its turn is unloaded from the corpus and released
  * again right after the closure has returned for it; documents that were already loaded are left
  * as they are.
  *
  * @param self the corpus to traverse
  * @param coll the collection that receives the closure's return values
  * @param closure the closure to call with each document
  * @return {@code coll}, i.e. the very collection that was passed in
  */
 public static <T> Collection<T> collect(Corpus self, Collection<T> coll, Closure<T> closure) {
   int index = 0;
   while (index < self.size()) {
     // Must be queried before get(), which loads the document if necessary.
     final boolean loadedForThisCall = !self.isDocumentLoaded(index);
     final Document current = self.get(index);

     coll.add(closure.call(current));

     if (loadedForThisCall) {
       self.unloadDocument(current);
       Factory.deleteResource(current);
     }
     index++;
   }
   return coll;
 }
예제 #5
0
 /**
  * Invokes the closure once for every document of the corpus, in index order, ignoring the
  * closure's return value.
  *
  * <p>A document that was not loaded before its turn is unloaded from the corpus and released
  * again right after the closure has returned for it; documents that were already loaded are left
  * as they are.
  *
  * @param self the corpus to traverse
  * @param closure the closure to call with each document
  * @return the corpus itself
  */
 public static <T> Object each(Corpus self, Closure<T> closure) {
   for (int position = 0; position < self.size(); position++) {
     // Remember the state before get() potentially loads the document.
     final boolean alreadyLoaded = self.isDocumentLoaded(position);
     final Document document = self.get(position);

     closure.call(document);

     if (alreadyLoaded) {
       continue;
     }
     self.unloadDocument(document);
     Factory.deleteResource(document);
   }
   return self;
 }
  /**
   * Creates a transient GATE corpus and fills it with one document per file path.
   *
   * <p>Progress is printed to standard output. A file that cannot be turned into a document is
   * reported and skipped; it does not abort the loading of the remaining files.
   *
   * @param files paths of the files to load
   * @return the corpus containing every document that could be loaded
   * @throws GateException if the corpus itself cannot be created
   */
  private static gate.Corpus createCorpus(ArrayList<String> files) throws GateException {

    gate.Corpus corpus = Factory.newCorpus("Transient Gate Corpus");
    for (String file : files) {
      System.out.print("\t " + file);
      try {
        // File.toURL() is deprecated because it does not escape characters that are illegal in
        // URLs (e.g. spaces); going through toURI() produces a well-formed URL.
        corpus.add(Factory.newDocument(new File(file).toURI().toURL()));
        System.out.println(" -- success");
      } catch (gate.creole.ResourceInstantiationException e) {
        System.out.println(" -- failed (" + e.getMessage() + ")");
      } catch (Exception e) {
        System.out.println(" -- " + e.getMessage());
      }
    }
    return corpus;
  }
예제 #7
0
 /**
  * Clear up the resources used after one test: empties and deletes the corpus, deletes the
  * learning resource and removes it from the controller, then cleans up and deletes the
  * controller.
  */
 private void clearOneTest() {
   // Empty the corpus before releasing it.
   corpus.clear();
   Factory.deleteResource(corpus);
   // NOTE(review): the learning resource is deleted before it is removed from the controller;
   // confirm this order is intentional.
   Factory.deleteResource(learningApi);
   controller.remove(learningApi);
   // NOTE(review): cleanup() is called explicitly and the controller is then also deleted through
   // the Factory; confirm both calls are needed.
   controller.cleanup();
   Factory.deleteResource(controller);
 }
예제 #8
0
  /**
   * Builds a SerialCorpus as a copy of a transient corpus: name, features, per-document metadata
   * and the document list itself are taken over from {@code tCorpus}, and the new corpus is
   * registered as a CREOLE listener.
   *
   * <p>According to the original documentation this constructor is called by adopt() so that the
   * corpus can be used before it has been synchronised to disk.
   *
   * @param tCorpus the in-memory corpus to copy from
   */
  protected SerialCorpusImpl(Corpus tCorpus) {
    // Take over the identity of the in-memory corpus.
    this.setName(tCorpus.getName());
    this.setFeatures(tCorpus.getFeatures());

    // Record name and implementation class of every document; the persistent ID is not known yet
    // and is therefore left null.
    docDataList = new ArrayList<DocumentData>();
    final List<String> names = tCorpus.getDocumentNames();
    int position = 0;
    for (String docName : names) {
      final String docClassName = tCorpus.get(position).getClass().getName();
      docDataList.add(new DocumentData(docName, null, docClassName));
      position++;
    }

    // Shallow copy: the very same Document instances are now held by this corpus too.
    documents = new ArrayList<Document>();
    documents.addAll(tCorpus);

    // Listen for CREOLE events so that events are fired when docs are added/removed/etc.
    Gate.getCreoleRegister().addCreoleListener(this);
  }
예제 #9
0
  /**
   * Initialises this corpus from a transient (in-memory) corpus: name, features, per-document
   * metadata and the document list are copied, the change-tracking lists are reset, and this
   * corpus is registered as a CREOLE listener.
   *
   * <p>The call is a no-op when {@code source} is not a {@link Corpus}, or when this corpus already
   * has both a datastore and a persistent ID. In the latter case initialisation is the job of
   * readObject(), which is what serialisation calls when objects are restored.
   *
   * @param source the transient corpus to copy from; anything else is ignored
   */
  public void setTransientSource(Object source) {
    if (!(source instanceof Corpus)) {
      return;
    }
    if (this.dataStore != null && this.lrPersistentId != null) {
      // Already stored in a datastore: nothing to initialise here.
      return;
    }

    final Corpus transientCorpus = (Corpus) source;

    // Take over the identity of the in-memory corpus.
    this.setName(transientCorpus.getName());
    this.setFeatures(transientCorpus.getFeatures());

    // Record name and implementation class of every document; the persistent ID is not known yet
    // and is therefore left null.
    docDataList = new ArrayList<DocumentData>();
    final List<String> names = transientCorpus.getDocumentNames();
    int position = 0;
    for (String docName : names) {
      final String docClassName = transientCorpus.get(position).getClass().getName();
      docDataList.add(new DocumentData(docName, null, docClassName));
      position++;
    }

    // Shallow copy: the very same Document instances are now held by this corpus too.
    documents = new ArrayList<Document>();
    documents.addAll(transientCorpus);

    // Start with empty change-tracking lists.
    this.addedDocs = new Vector<Document>();
    this.removedDocIDs = new Vector<String>();
    this.changedDocs = new Vector<Document>();

    // Listen for CREOLE events so that events are fired when docs are added/removed/etc.
    Gate.getCreoleRegister().addCreoleListener(this);
  }
  /**
   * Stand-alone runner: initialises GATE, builds a one-PR pipeline around a {@code
   * LearningEvaluator}, populates a corpus from a hard-coded directory and executes the pipeline
   * over it.
   *
   * @param args ignored
   * @throws Exception if GATE initialisation, corpus population or execution fails
   */
  public static void main(String[] args) throws Exception {
    GateUtils.initGateKeepLog();
    GateUtils.registerCzsemPlugin();

    // Evaluator configuration; "lemma_flex" is an alternative value for responseASName.
    ProcessingResource evaluator =
        new PRSetup.SinglePRSetup(LearningEvaluator.class)
            .putFeature("keyASName", ":-)")
            .putFeature("responseASName", "flex")
            .putFeature("keyAnnotationsAreInDocumentFeatures", true)
            .putFeatureList("annotationTypes", "Lookup")
            .putFeatureList("featureNames", "meshID")
            .createPR();

    String controllerClassName = SerialAnalyserController.class.getCanonicalName();
    SerialAnalyserController pipeline =
        (SerialAnalyserController) Factory.createResource(controllerClassName);
    pipeline.add(evaluator);

    Corpus documents = Factory.newCorpus(null);
    // NOTE(review): machine-specific input directory
    // ("C:\\Users\\dedek\\Desktop\\bmca_devel" was used as an alternative).
    File inputDirectory = new File("C:\\Users\\dedek\\Desktop\\bmc\\experiment\\analyzed");
    documents.populate(inputDirectory.toURI().toURL(), null, "utf8", false);

    System.err.println("populated");

    pipeline.setCorpus(documents);
    pipeline.execute();
  }
예제 #11
0
 /**
  * Loads the configuration file and the test corpus, and wires them together the same way the
  * GATE GUI would: a learning resource configured from {@code configFileName}, a corpus holding
  * every {@code .xml} file of {@code corpusDirName} (sorted by file name, read as UTF-8), and a
  * serial controller running the learning resource over that corpus.
  *
  * @param configFileName path of the learning configuration file
  * @param corpusDirName directory containing the XML documents of the corpus
  * @param inputasN name of the input annotation set
  * @param outputasN name of the output annotation set
  * @throws GateException if a GATE resource cannot be created
  * @throws IOException if {@code corpusDirName} cannot be listed (missing, not a directory or
  *     unreadable) or a file URL cannot be built
  */
 void loadSettings(String configFileName, String corpusDirName, String inputasN, String outputasN)
     throws GateException, IOException {
   LogService.minVerbosityLevel = 0;
   if (LogService.minVerbosityLevel > 0)
     System.out.println("Learning Home : " + learningHome.getAbsolutePath());

   // List the corpus files first so that a bad directory fails fast, before any GATE resource
   // has been allocated. File.listFiles() returns null (not an empty array) on failure.
   ExtensionFileFilter fileFilter = new ExtensionFileFilter();
   fileFilter.addExtension("xml");
   File[] xmlFiles = new File(corpusDirName).listFiles(fileFilter);
   if (xmlFiles == null) {
     throw new IOException("Cannot list corpus directory: " + corpusDirName);
   }
   // Sort by name so the document order does not depend on the file system.
   Arrays.sort(
       xmlFiles,
       new Comparator<File>() {
         public int compare(File a, File b) {
           return a.getName().compareTo(b.getName());
         }
       });

   FeatureMap parameters = Factory.newFeatureMap();
   URL configFileURL = new File(configFileName).toURI().toURL();
   parameters.put("configFileURL", configFileURL);
   learningApi =
       (LearningAPIMain) Factory.createResource("gate.learning.LearningAPIMain", parameters);

   // Load the corpus
   corpus = Factory.newCorpus("DataSet");
   for (File f : xmlFiles) {
     if (!f.isDirectory()) {
       Document doc = Factory.newDocument(f.toURI().toURL(), "UTF-8");
       doc.setName(f.getName());
       corpus.add(doc);
     }
   }

   // Set the input and output annotation sets
   learningApi.setInputASName(inputasN);
   learningApi.setOutputASName(outputasN);
   controller =
       (gate.creole.SerialAnalyserController)
           Factory.createResource("gate.creole.SerialAnalyserController");
   controller.setCorpus(corpus);
   controller.add(learningApi);
 }
예제 #12
0
  /**
   * Command-line entry point of the GATE extractor.
   *
   * <p>Options: {@code -i <path>} input directory (required); exactly one of {@code -t} (train),
   * {@code -e} (evaluate) or {@code -r <path>} (apply the learned rules and write the results to
   * the given output path). The corpus is always tagged first; the selected mode runs afterwards.
   *
   * <p>The GATE home directory is the {@code resources} folder next to the location this class was
   * loaded from. The corpus and the generated ml-config file are cleaned up when the run ends,
   * also when it ends with an exception.
   *
   * @param args the command-line arguments
   * @throws Exception if GATE initialisation, corpus loading or any pipeline fails
   */
  public static void main(String[] args) throws Exception {
    /* Parse command line arguments */
    Getopt g = new Getopt("gateExtractor", args, "i:r:te");
    g.setOpterr(false);

    String inputPath = "";
    String outputPath = "";

    boolean train = false;
    boolean eval = false;
    boolean run = false;

    int c;
    String arg;
    while ((c = g.getopt()) != -1) {
      switch (c) {
        case 'i':
          arg = g.getOptarg();
          if (arg == null || arg.isEmpty()) {
            usage("Please provide an input path");
          }
          inputPath = arg;
          break;
        case 'r':
          run = true;
          arg = g.getOptarg();
          if (arg == null || arg.isEmpty()) {
            usage("Please provide an output path");
          }
          outputPath = arg;
          break;
        case 't':
          train = true;
          break;
        case 'e':
          eval = true;
          break;
        case '?':
        default:
          usage(null);
      }
    }

    if (args.length == 0 || (!run && !train && !eval)) {
      usage("Nothing to do.");
    }

    if (inputPath == null || inputPath.isEmpty()) {
      usage("Please provide an input path");
    }

    if (run && (outputPath == null || outputPath.isEmpty())) {
      usage("Please provide an output directory!");
    }

    // train, eval and run are mutually exclusive.
    int selectedModes = (train ? 1 : 0) + (eval ? 1 : 0) + (run ? 1 : 0);
    if (selectedModes > 1) {
      usage("Only one mode allowed at a time");
    }

    /* Initialize GATE */
    // URL.getPath() keeps percent-escapes (a space stays "%20"), which yields a non-existent
    // file for install paths containing such characters; converting through a URI decodes them.
    // Fall back to the raw path for URLs that are not valid URIs.
    URL codeSourceUrl = Main.class.getProtectionDomain().getCodeSource().getLocation();
    File codeSourceFile;
    try {
      codeSourceFile = new File(codeSourceUrl.toURI());
    } catch (java.net.URISyntaxException e) {
      codeSourceFile = new File(codeSourceUrl.getPath());
    } catch (IllegalArgumentException e) {
      codeSourceFile = new File(codeSourceUrl.getPath());
    }
    String location = codeSourceFile.getParent();
    String resourcesFolder = location + "/resources";
    Gate.setGateHome(new File(resourcesFolder));

    Corpus corpus = null;
    try {
      /* Create ml-config.xml with threads */
      createConfig(resourcesFolder + File.separator);
      Gate.init();

      /* Load Corpus */
      log.info("Loading Corpus ... ");
      corpus = Factory.newCorpus("Training Corpus");
      File directory = new File(inputPath);
      URL url = directory.toURI().toURL();
      corpus.populate(url, null, null, true);
      log.info("Done loading Corpus!");

      Pipeline pipeline = null;

      /* Do Tagging */
      pipeline = new Tagger();
      pipeline.run(corpus, resourcesFolder);

      /* Train */
      if (train) {
        pipeline = new Trainer();
        pipeline.run(corpus, resourcesFolder);
      }

      /* Apply learned rules */
      if (run) {
        pipeline = new Extractor();
        pipeline.run(corpus, resourcesFolder);

        // Write the output of every document in parallel and wait until all writers are done.
        ExecutorService executorService = Executors.newFixedThreadPool(20);
        try {
          for (int i = 0; i < corpus.size(); i++) {
            executorService.execute(new OutputGenerator(outputPath, corpus.get(i)));
          }
        } finally {
          // Always shut down, otherwise the pool's threads keep the JVM alive after a failure.
          executorService.shutdown();
        }
        executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
      }

      /* Evaluate results */
      if (eval) {
        pipeline = new Evaluator();
        pipeline.run(corpus, resourcesFolder);
      }
    } finally {
      /* Clean up */
      if (corpus != null) {
        Factory.deleteResource(corpus);
      }
      if (outputFile_mlConfigThreads != null) {
        outputFile_mlConfigThreads.delete();
      }
    }
  }
예제 #13
0
  /**
   * The main entry point. The command line is parsed first (see usage() for details); every file
   * found in the input directory is then loaded, processed with the saved GATE application, and
   * written as GATE XML to {@code <outputDir>/<inputFileName>.out.xml}.
   *
   * <p>If a list of annotation types was given, only annotations of those types taken from the
   * {@code "Output"} annotation set are written; otherwise the whole document is written.
   *
   * <p>NOTE(review): {@code gappFile}, {@code inputDir}, {@code outputDir}, {@code encoding} and
   * {@code annotTypesToWrite} are presumably set by {@code parseCommandLine}; confirm.
   *
   * @param args the command-line arguments
   * @throws Exception if GATE initialisation, document loading, execution or writing fails
   */
  public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    ArrayList<String> files = getFilesFromDir(inputDir);

    // Create an EMPTY corpus and recycle it for each iteration. Pre-populating it with every input
    // file would make the first execute() process all documents (whose results are discarded)
    // and would leave those documents loaded for the rest of the run, since each file is loaded
    // individually in the loop below anyway. The string parameter is simply the GATE-internal
    // name of the corpus.
    gate.Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {

      // load the document (using the specified encoding if one was given)
      File docFile = new File(files.get(i));
      System.out.print("Processing document " + docFile + " (" + i + ") ...");
      // toURI().toURL() instead of the deprecated File.toURL(), which does not escape
      // characters that are illegal in URLs.
      Document doc = Factory.newDocument(docFile.toURI().toURL(), encoding);

      String docXMLString = null;
      try {
        // put the document in the corpus and run the application
        corpus.add(doc);
        try {
          application.execute();
        } finally {
          // remove the document from the corpus again, even if execution failed
          corpus.clear();
        }

        // if we want to just write out specific annotation types, we must
        // extract the annotations into a Set
        if (annotTypesToWrite != null) {
          // Create a temporary Set to hold the annotations we wish to write out
          Set<gate.Annotation> annotationsToWrite = new HashSet<gate.Annotation>();

          // we only extract annotations from the "Output" AnnotationSet
          AnnotationSet outputAnnots = doc.getAnnotations("Output");
          Iterator<?> annotTypesIt = annotTypesToWrite.iterator();
          while (annotTypesIt.hasNext()) {
            // extract all the annotations of each requested type and add them to
            // the temporary set
            AnnotationSet annotsOfThisType = outputAnnots.get((String) annotTypesIt.next());
            if (annotsOfThisType != null) {
              annotationsToWrite.addAll(annotsOfThisType);
            }
          }

          // create the XML string using these annotations
          docXMLString = doc.toXml(annotationsToWrite, true);
        }
        // otherwise, just write out the whole document as GateXML
        else {
          docXMLString = doc.toXml();
        }
      } finally {
        // Release the document, as it is no longer needed
        Factory.deleteResource(doc);
      }

      // output the XML to <outputDir>/<inputFile>.out.xml
      System.out.println("Writing file " + docFile.getName());
      String outputFileName = docFile.getName() + ".out.xml";
      File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

      // Write output files using the same encoding as the original
      FileOutputStream fos = new FileOutputStream(outputFile);
      try {
        BufferedOutputStream bos = new BufferedOutputStream(fos);
        OutputStreamWriter out;
        if (encoding == null) {
          out = new OutputStreamWriter(bos);
        } else {
          out = new OutputStreamWriter(bos, encoding);
        }

        out.write(docXMLString);
        // Push everything through the writer and the buffer before the stream is closed.
        out.flush();
      } finally {
        // Closing the underlying stream releases the file handle even if writing failed.
        fos.close();
      }
      System.out.println("done");
    } // for each file

    System.out.println("All done");
  } // void main(String[] args)
예제 #14
0
  /**
   * Runs the GATE application over every record in {@code this.recs} and converts the resulting
   * annotations into one comma-separated line per record.
   *
   * <p>For each record:
   *
   * <ol>
   *   <li>If the record contains the title marker ({@code ",>"} followed by spaces), the part before
   *       it is treated as the title. When the title ends in {@code -<integer>} that integer is
   *       taken as the "title age" and cut off; title and body are then joined by a separator
   *       string.
   *   <li>The text is parsed as an HTML fragment; its plain text (with the separator replaced by a
   *       space) is stored in the processed-text list and is also what GATE analyses.
   *   <li>The annotations of the default annotation set plus the links and media references of the
   *       HTML are flattened into one line. Columns are separated by {@code ","}, multiple values
   *       inside a column by {@code ";"}.
   * </ol>
   *
   * <p>The GATE document of each record and the shared corpus are released even when processing
   * fails.
   *
   * @return the generated lines together with the plain text of every record
   * @throws Exception if a GATE resource cannot be created or the application fails
   */
  private RetObj ProcessRecords() throws Exception {
    // Inserted between title and body so that both are analysed as a single text.
    final String sep = "..THIS IS MY SEPARATION STRING..";

    // object for returned data
    List<String> processedlines = new ArrayList<String>();
    List<String> processedText = new ArrayList<String>();

    // Create a Corpus to use. The same Corpus object is recycled for each record and only ever
    // holds one document at a time.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    try {
      this.application.setCorpus(corpus);

      for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
        // first, split title from body and get embedded age in title..
        String title_age = "-1";
        String body = this.recs.get(record_num);
        int age_end = body.indexOf(",>           ");
        if (age_end >= 0) {
          // By default the whole text before the marker is the title ...
          int title_end = age_end;
          int age_start = body.lastIndexOf("-", age_end);
          if (age_start >= 0 && age_start < age_end) {
            String candidate = body.substring(age_start + 1, age_end).trim();
            if (isInteger(candidate)) {
              // ... unless it ends in "-<integer>", which is the age and not part of the title.
              title_age = candidate;
              title_end = age_start;
            }
          }
          // Skip the two marker characters ",>" and glue title and body together.
          body = body.substring(0, title_end) + sep + body.substring(age_end + 2);
        }

        org.jsoup.nodes.Document htmldoc =
            Jsoup.parseBodyFragment(body.replace("COMMA_GOES_HERE", ","));
        Elements links = htmldoc.select("a[href]");
        Elements media = htmldoc.select("[src]");
        Elements imports = htmldoc.select("link[href]");

        String plainText = htmldoc.text();
        processedText.add(plainText.replace(sep, " "));

        Document doc = Factory.newDocument(plainText);
        try {
          // put the document in the corpus and run the application
          corpus.add(doc);
          try {
            this.application.execute();
          } finally {
            // remove the document from the corpus again, even if execution failed
            corpus.clear();
          }

          processedlines.add(
              buildRecordLine(doc.getAnnotations(""), title_age, links, media, imports));
        } finally {
          // Release the document, as it is no longer needed
          Factory.deleteResource(doc);
        }
      }
    } finally {
      Factory.deleteResource(corpus);
    }

    return new RetObj(processedlines, processedText);
  }

  /**
   * Flattens the annotations and HTML references of one record into its output line. The column
   * order is: first/third person counts, names, ages, costs, height (feet, inches), weight,
   * measurement (cup, chest, waist, hip), ethnicity, skin/eye/hair colour, restriction (type,
   * ethnicity, age), phone (value, state, area), e-mails, URLs, media.
   */
  private String buildRecordLine(
      AnnotationSet annots, String titleAge, Elements links, Elements media, Elements imports) {
    StringBuilder line = new StringBuilder();

    line.append(annotationCount(annots.get("FirstPerson"))).append(',');
    line.append(annotationCount(annots.get("ThirdPerson"))).append(',');

    line.append(joinFeature(annotationsInOrder(annots.get("Name")), "name", "", false));
    line.append(',');

    // The age found in the title always comes first, followed by the annotated ages.
    List<Annotation> ages = annotationsInOrder(annots.get("Age"));
    line.append(titleAge);
    if (!ages.isEmpty()) {
      line.append(';').append(joinFeature(ages, "age", "", false));
    }
    line.append(',');

    // Each cost is rendered as value/target_value/target_type.
    Iterator<Annotation> costIt = annotationsInOrder(annots.get("Cost")).iterator();
    while (costIt.hasNext()) {
      Annotation cost = costIt.next();
      line.append(featureText(cost, "value", "none", false)).append('/');
      line.append(featureText(cost, "target_value", "none", false)).append('/');
      line.append(featureText(cost, "target_type", "none", false));
      if (costIt.hasNext()) {
        line.append(';');
      }
    }
    line.append(',');

    List<Annotation> heights = annotationsInOrder(annots.get("height"));
    line.append(joinFeature(heights, "feet", "none", false)).append(',');
    line.append(joinFeature(heights, "inches", "none", false)).append(',');

    line.append(joinFeature(annotationsInOrder(annots.get("weight")), "pounds", "", false));
    line.append(',');

    List<Annotation> measurements = annotationsInOrder(annots.get("measurement"));
    line.append(joinFeature(measurements, "cup", "none", false)).append(',');
    line.append(joinFeature(measurements, "chest", "none", false)).append(',');
    line.append(joinFeature(measurements, "waist", "none", false)).append(',');
    line.append(joinFeature(measurements, "hip", "none", false)).append(',');

    line.append(joinFeature(annotationsInOrder(annots.get("Ethnicity")), "ethnicity", "", true));
    line.append(',');
    line.append(joinFeature(annotationsInOrder(annots.get("SkinColor")), "color", "", true));
    line.append(',');
    line.append(joinFeature(annotationsInOrder(annots.get("EyeColor")), "color", "", true));
    line.append(',');
    line.append(joinFeature(annotationsInOrder(annots.get("HairColor")), "color", "", true));
    line.append(',');

    List<Annotation> restrictions = annotationsInOrder(annots.get("Restriction"));
    line.append(joinFeature(restrictions, "type", "none", false)).append(',');
    line.append(joinFeature(restrictions, "ethnicity", "none", true)).append(',');
    line.append(joinFeature(restrictions, "age", "none", false)).append(',');

    List<Annotation> phones = annotationsInOrder(annots.get("PhoneNumber"));
    line.append(joinFeature(phones, "value", "none", false)).append(',');
    line.append(joinFeature(phones, "state", "none", false)).append(',');
    line.append(joinFeature(phones, "area", "none", true)).append(',');

    // E-mails: annotated addresses first, then the targets of mailto: links.
    StringBuilder emails = new StringBuilder();
    appendTerminatedFeature(emails, annotationsInOrder(annots.get("Email")), "email");
    if (links != null) {
      for (Element link : links) {
        String href = link.attr("abs:href");
        if (href == null) continue;
        if (isMailto(href)) {
          emails.append(stripDelimiters(href.substring(7))).append(';');
        }
      }
    }
    line.append(withoutTrailingSemicolon(emails)).append(',');

    // URLs: annotated URLs, then every non-mailto anchor, then every <link href>.
    StringBuilder urls = new StringBuilder();
    appendTerminatedFeature(urls, annotationsInOrder(annots.get("Url")), "url");
    if (links != null) {
      for (Element link : links) {
        String href = link.attr("abs:href");
        if (href == null) continue;
        if (!isMailto(href)) {
          urls.append(stripDelimiters(href)).append(';');
        }
      }
    }
    if (imports != null) {
      for (Element imported : imports) {
        String href = imported.attr("abs:href");
        if (href == null) continue;
        urls.append(stripDelimiters(href)).append(';');
      }
    }
    line.append(withoutTrailingSemicolon(urls)).append(',');

    // Media: everything carrying a src attribute. This is the last column, so no trailing comma.
    StringBuilder medias = new StringBuilder();
    if (media != null) {
      for (Element element : media) {
        String src = element.attr("abs:src");
        if (src == null) continue;
        medias.append(stripDelimiters(src)).append(';');
      }
    }
    line.append(withoutTrailingSemicolon(medias));

    return line.toString();
  }

  /** Returns the number of annotations in the set, treating a missing set as empty. */
  private int annotationCount(AnnotationSet set) {
    return set == null ? 0 : set.size();
  }

  /** Returns the annotations in document order, or an empty list for a missing or empty set. */
  private List<Annotation> annotationsInOrder(AnnotationSet set) {
    if (set == null || set.size() < 1) {
      return new ArrayList<Annotation>();
    }
    return set.inDocumentOrder();
  }

  /**
   * Returns the given feature of the annotation as text, or {@code missing} if the feature is not
   * set. With {@code sanitize} the value is lower-cased and stripped of column/value delimiters.
   */
  private String featureText(Annotation ann, String key, String missing, boolean sanitize) {
    Object value = ann.getFeatures().get(key);
    if (value == null) {
      return missing;
    }
    String text = value.toString();
    return sanitize ? stripDelimiters(text.toLowerCase()) : text;
  }

  /** Joins the given feature of all annotations with ";" (one entry per annotation). */
  private String joinFeature(
      List<Annotation> annotations, String key, String missing, boolean sanitize) {
    StringBuilder joined = new StringBuilder();
    Iterator<Annotation> it = annotations.iterator();
    while (it.hasNext()) {
      joined.append(featureText(it.next(), key, missing, sanitize));
      if (it.hasNext()) {
        joined.append(';');
      }
    }
    return joined.toString();
  }

  /**
   * Appends the sanitised feature value of every annotation that has the feature, each followed
   * by ";". Annotations without the feature contribute nothing.
   */
  private void appendTerminatedFeature(StringBuilder out, List<Annotation> annotations, String key) {
    for (Annotation ann : annotations) {
      Object value = ann.getFeatures().get(key);
      if (value != null) {
        out.append(stripDelimiters(value.toString().toLowerCase())).append(';');
      }
    }
  }

  /** Tells whether the href is a non-empty mailto: link (scheme compared case-insensitively). */
  private boolean isMailto(String href) {
    return href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:");
  }

  /** Replaces the column delimiter "," and the value delimiter ";" by spaces. */
  private String stripDelimiters(String value) {
    return value.replace(',', ' ').replace(';', ' ');
  }

  /** Returns the buffer content without its last character if that character is ";". */
  private String withoutTrailingSemicolon(StringBuilder values) {
    int length = values.length();
    if (length > 0 && values.charAt(length - 1) == ';') {
      return values.substring(0, length - 1);
    }
    return values.toString();
  }
예제 #15
0
  /**
   * Runs the GATE application over one Behemoth document and copies the results back onto it.
   *
   * <p>The input is converted to a GATE document, placed in the shared corpus and processed by
   * the GATE application. The resulting annotations are converted, sorted and appended to the
   * input document, and the document features named by the filter are copied into the input
   * document's metadata. Any exception is logged and counted rather than propagated, so the
   * input document is returned even when processing fails.
   *
   * @param inputDoc the document to process; it is modified in place
   * @param reporter receives status and counter updates; may be {@code null}
   * @return a one-element array holding {@code inputDoc}
   */
  public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    final boolean reporting = reporter != null;
    if (reporting) {
      reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    }

    final boolean dropExisting = config.getBoolean("gate.deleteBehemothAnnotations", false);

    gate.Document gateDoc = null;
    try {
      // Build the GATE document, hand it to the application through the shared corpus and run it.
      gateDoc = generateGATEDoc(inputDoc);
      corpus.add(gateDoc);
      this.GATEapplication.setCorpus(corpus);
      this.GATEapplication.execute();

      // An empty set name means the no-argument getAnnotations() is used instead of a named set.
      final AnnotationSet gateAnnotations =
          "".equals(filters.getAnnotationSetName())
              ? gateDoc.getAnnotations()
              : gateDoc.getAnnotations(filters.getAnnotationSetName());

      // Convert the GATE annotations to Behemoth ones and sort them before they are added.
      final List<com.digitalpebble.behemoth.Annotation> converted =
          convertGATEAnnotationsToBehemoth(gateAnnotations, inputDoc);
      Collections.sort(converted);

      // Optionally discard what the document already carried, then append the new annotations.
      if (dropExisting) {
        inputDoc.getAnnotations().clear();
      }
      inputDoc.getAnnotations().addAll(converted);

      // One counter increment per added annotation, keyed by annotation type.
      if (reporting) {
        for (com.digitalpebble.behemoth.Annotation added : converted) {
          reporter.incrCounter("GATE", added.getType(), 1);
        }
      }

      // Copy the selected GATE document features into the Behemoth metadata.
      final Set<String> wantedFeatures = this.filters.getDocFeaturesFilter();
      // NOTE(review): the 'true' flag presumably creates the metadata map when absent - confirm.
      final MapWritable metadata = inputDoc.getMetadata(true);
      if (!wantedFeatures.isEmpty()) {
        for (String featureName : wantedFeatures) {
          final Object value = gateDoc.getFeatures().get(featureName);
          if (value == null) {
            continue;
          }
          metadata.put(new Text(featureName), new Text(value.toString()));
        }
      }

      if (reporting) {
        reporter.incrCounter("GATE", "Document", 1);
      }

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporting) {
        reporter.incrCounter("GATE", "Exceptions", 1);
      }
    } finally {
      // Always empty the shared corpus and release the GATE document, even after a failure.
      corpus.clear();
      if (gateDoc != null) {
        Factory.deleteResource(gateDoc);
      }
    }

    // currently returns only the input document
    return new BehemothDocument[] {inputDoc};
  }
  /**
   * Run from the command-line, with a list of URLs as argument.
   *
   * <p>Every URL is loaded as a GATE document, the ANNIE pipeline is run over the resulting
   * corpus, and for each document two files are written relative to the working directory:
   * {@code StANNIE_<n>.HTML} (the original content with {@code <span>} tags wrapped around the
   * Person and Location annotations, written only when the original content was preserved) and
   * {@code StANNIE_toXML_<n>.HTML} (the output of {@code Document.toXml} for those annotations).
   *
   * <p><B>NOTE:</B><br>
   * This code will run with all the documents in memory - if you want to unload each from memory
   * after use, add code to store the corpus in a DataStore.
   *
   * @param args the URLs of the documents to process
   * @throws GateException if a GATE call fails
   * @throws IOException if a URL is malformed or an output file cannot be written
   */
  public static void main(String args[]) throws GateException, IOException {
    // initialise the GATE library
    Out.prln("Initialising GATE...");
    Gate.init();
    Out.prln("...GATE initialised");

    // initialise ANNIE (this may take several minutes)
    StandAloneAnnie annie = new StandAloneAnnie();
    annie.initAnnie();

    // create a GATE corpus and add a document for each command-line
    // argument
    Corpus corpus = Factory.newCorpus("StandAloneAnnie corpus");
    for (String arg : args) {
      URL u = new URL(arg);
      FeatureMap params = Factory.newFeatureMap();
      params.put("sourceUrl", u);
      // Boolean.TRUE instead of the deprecated, allocating Boolean constructor.
      params.put("preserveOriginalContent", Boolean.TRUE);
      params.put("collectRepositioningInfo", Boolean.TRUE);
      Out.prln("Creating doc for " + u);
      Document doc = (Document) Factory.createResource("gate.corpora.DocumentImpl", params);
      corpus.add(doc);
    } // for each of args

    // tell the pipeline about the corpus and run it
    annie.setCorpus(corpus);
    annie.execute();

    // pieces of the <span> start tag; the annotation id and type are placed between them
    final String startTagPart_1 = "<span GateID=\"";
    final String startTagPart_2 = "\" title=\"";
    final String startTagPart_3 = "\" style=\"background:Red;\">";
    final String endTag = "</span>";

    // for each document, get an XML document with the
    // person and location names added
    int count = 0;
    for (Object corpusItem : corpus) {
      Document doc = (Document) corpusItem;
      AnnotationSet defaultAnnotSet = doc.getAnnotations();
      Set<String> annotTypesRequired = new HashSet<String>();
      annotTypesRequired.add("Person");
      annotTypesRequired.add("Location");
      Set<Annotation> peopleAndPlaces =
          new HashSet<Annotation>(defaultAnnotSet.get(annotTypesRequired));

      FeatureMap features = doc.getFeatures();
      String originalContent =
          (String) features.get(GateConstants.ORIGINAL_DOCUMENT_CONTENT_FEATURE_NAME);
      RepositioningInfo info =
          (RepositioningInfo) features.get(GateConstants.DOCUMENT_REPOSITIONING_INFO_FEATURE_NAME);

      ++count;
      File file = new File("StANNIE_" + count + ".HTML");
      Out.prln("File name: '" + file.getAbsolutePath() + "'");
      if (originalContent != null) {
        // With repositioning info the annotation offsets are mapped back to positions in the
        // original content; without it the annotation offsets are used unchanged.
        if (info != null) {
          Out.prln("OrigContent and reposInfo existing. Generate file...");
        } else {
          Out.prln("OrigContent existing. Generate file...");
        }

        SortedAnnotationList sortedAnnotations = new SortedAnnotationList();
        for (Annotation annotation : peopleAndPlaces) {
          sortedAnnotations.addSortedExclusive(annotation);
        }

        StringBuilder editableContent = new StringBuilder(originalContent);
        Out.prln("Unsorted annotations count: " + peopleAndPlaces.size());
        Out.prln("Sorted annotations count: " + sortedAnnotations.size());
        // insert annotation tags backward (last list entry first)
        // NOTE(review): presumably SortedAnnotationList orders by offset, so inserting from the
        // end leaves the positions of the not-yet-processed annotations unshifted - confirm.
        for (int i = sortedAnnotations.size() - 1; i >= 0; --i) {
          Annotation currAnnot = (Annotation) sortedAnnotations.get(i);
          long insertPositionStart = currAnnot.getStartNode().getOffset().longValue();
          if (info != null) {
            insertPositionStart = info.getOriginalPos(insertPositionStart);
          }
          long insertPositionEnd = currAnnot.getEndNode().getOffset().longValue();
          if (info != null) {
            insertPositionEnd = info.getOriginalPos(insertPositionEnd, true);
          }
          if (insertPositionEnd != -1 && insertPositionStart != -1) {
            // The end tag goes in first; the start tag is then assembled right-to-left by
            // inserting each piece at the same start position.
            editableContent.insert((int) insertPositionEnd, endTag);
            editableContent.insert((int) insertPositionStart, startTagPart_3);
            editableContent.insert((int) insertPositionStart, currAnnot.getType());
            editableContent.insert((int) insertPositionStart, startTagPart_2);
            editableContent.insert((int) insertPositionStart, currAnnot.getId().toString());
            editableContent.insert((int) insertPositionStart, startTagPart_1);
          } // if
        } // for

        // Close the writer even when the write fails, so the file handle is never leaked.
        FileWriter writer = new FileWriter(file);
        try {
          writer.write(editableContent.toString());
        } finally {
          writer.close();
        }
      } else {
        // no original content was preserved: nothing to mark up, just report what is available
        Out.prln("Content : " + originalContent);
        Out.prln("Repositioning: " + info);
      }

      String xmlDocument = doc.toXml(peopleAndPlaces, false);
      String fileName = "StANNIE_toXML_" + count + ".HTML";
      FileWriter xmlWriter = new FileWriter(fileName);
      try {
        xmlWriter.write(xmlDocument);
      } finally {
        xmlWriter.close();
      }
    } // for each doc
  } // main
예제 #17
0
  /**
   * Reads the SPC record list and runs the GATE controller over the file of each record.
   *
   * <p>For every CSV record the PDF file name (column 4) is tried first and the DOC file name
   * (column 3) second; the first one that exists in the input directory is loaded as a GATE
   * document and processed. Records without an existing file are skipped with a warning. The
   * loop ends when the CSV is exhausted or the stop-request detector signals a stop.
   *
   * <p>The CSV reader and the exporter are closed even when processing fails, and each GATE
   * document is removed from the corpus and deleted even when the controller throws.
   *
   * @param args not used
   * @throws Exception if initialisation, CSV reading or document loading fails
   */
  public static void main(String[] args) throws Exception {

    final String inputDir = "C:/Users/dedek/Desktop/DATLOWE/SPC_all/";

    GateUtils.initGate();

    ie =
        new SpcInteractionExport(
            "SpcInteractionExport/SpcInteractionExport_" + ProjectSetup.makeTimeStamp() + ".csv");

    CsvReader in = null;
    try {
      initController();

      in =
          new CsvReader(
              "C:/Users/dedek/Desktop/DATLOWE/LP_SPC.csv", ';', Charset.forName("cp1250"));
      in.readHeaders();

      StopRequestDetector srd = new StopRequestDetector();
      srd.startDetector();

      int num = 0;

      while (!srd.stop_requested) {
        if (!in.readRecord()) break;

        String spcCode = in.get(0);
        ie.setSpcCode(spcCode);

        String doc = in.get(3);
        String pdf = in.get(4);

        // Pick the first candidate (PDF before DOC) that names an existing file.
        String fileStr = "";
        File file = null;
        for (String candidate : new String[] {pdf, doc}) {
          if (candidate.isEmpty()) continue;
          File candidateFile = new File(inputDir + candidate);
          if (candidateFile.exists()) {
            fileStr = candidate;
            file = candidateFile;
            break;
          }
        }

        System.err.format(
            "%4d %s record: %s %s %s\n",
            ++num, ProjectSetup.makeTimeStamp(), spcCode, in.get(1), fileStr);
        if (fileStr.isEmpty()) {
          System.err.format("WARNING ommiting record: %s %s\n", in.get(0), in.get(1));
          continue;
        }

        Document gateDoc = Factory.newDocument(file.toURI().toURL(), "utf8");
        try {
          corpus.add(gateDoc);

          try {
            controller.execute();
          } catch (ExecutionException e) {
            // A failed execution re-initialises the controller so later records still run.
            System.err.println("----------------------- EXECUTION INTERUPTED -------------------");
            e.printStackTrace();
            initController();
            System.err.println("----------------------- EXECUTION RESTARTED -------------------");
          }
        } finally {
          // Release the document whatever happened, so a failure does not leave it in memory.
          corpus.clear();
          // GateUtils.saveGateDocumentToXML(gateDoc,
          // "C:/Users/dedek/Desktop/DATLOWE/SPC_ananlyzed/"+spcCode+".xml");
          Factory.deleteResource(gateDoc);
        }
      }
    } finally {
      // Close the reader and the exporter on every exit path; the exporter is closed even if
      // closing the reader fails.
      try {
        if (in != null) in.close();
      } finally {
        ie.close();
      }
    }

    GateUtils.deleteAllPublicGateResources();
  }