@SuppressWarnings("unchecked")
 public Set<String> processDoc(String str) throws Exception {
   Set<String> toReturn = new HashSet<String>();
   Corpus c = null;
   Document aDoc = null;
   try {
     c = Factory.newCorpus("sample");
     aDoc = Factory.newDocument(str);
     c.add(aDoc);
     controller.setCorpus(c);
     controller.execute();
     AnnotationSet aSet = aDoc.getAnnotations("StockSymbols");
     for (Annotation annot : aSet) {
       String symbol = (String) annot.getFeatures().get("sym");
       toReturn.add(symbol);
     }
   } catch (Exception e) {
     throw e;
   } finally {
     if (aDoc != null) {
       Factory.deleteResource(aDoc);
     }
     if (c != null) {
       Factory.deleteResource(c);
     }
   }
   return toReturn;
 }
 /** Clear up the resources used after one test. */
 private void clearOneTest() {
   corpus.clear();
   Factory.deleteResource(corpus);
   Factory.deleteResource(learningApi);
   controller.remove(learningApi);
   controller.cleanup();
   Factory.deleteResource(controller);
 }
 /**
  * Call the given closure passing this resource as a parameter, and ensuring that the resource is
  * deleted when the closure returns. This would typically be used in this kind of construction:
  *
  * <pre>
  * Factory.newDocument(someUrl).withResource {
  *   // do something with the document (it)
  * }
  * </pre>
  *
  * @param self
  * @param closure
  * @return the value returned from the closure
  */
 public static <T> T withResource(Resource self, Closure<T> closure) {
   try {
     return closure.call(self);
   } finally {
     Factory.deleteResource(self);
   }
 }
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
  public JSONObject persian_sentiment(String text) throws Exception {

    oncreate();

    File PersianGapp = new File("C:/Users/mohammad/Desktop/New folder/Gate/application.xgapp");
    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application

    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(PersianGapp);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    // process the files one by one

    // load the document (using the specified encoding if one was given)

    Document doc = Factory.newDocument(text);

    // put the document in the corpus
    corpus.add(doc);

    // run the application
    application.execute();

    String featureName = "Doc_sentiment";
    FeatureMap features = doc.getFeatures();
    // remove the document from the corpus again
    corpus.clear();

    // doc.getFeatures().
    // Release the document, as it is no longer needed
    Factory.deleteResource(doc);

    LinkedHashMap originalContent = (LinkedHashMap) features.get(featureName);

    String obj = (String) originalContent.get("sentiment");
    // BigDecimal pos =(BigDecimal) originalContent.get("positive");
    // BigDecimal neg =(BigDecimal) originalContent.get("negative");
    // System.out.println(obj);
    // create Json for response to user
    JSONObject obj1 = new JSONObject();
    obj1.put("sentiment", obj);
    /*obj1.put("positive",pos);
    //obj1.put("negative",neg);
    System.out.print("----------");
    System.out.print(obj1);
    System.out.print("----------");*/
    // application.cleanup();
    return obj1;
  }
Beispiel #6
0
 @Override
 public void datastoreClosed(CreoleEvent e) {
   if (!e.getDatastore().equals(this.getDataStore())) return;
   if (this.getDataStore() != null) this.getDataStore().removeDatastoreListener(this);
   // close this corpus, since it cannot stay open when the DS it comes
   // from
   // is closed
   Factory.deleteResource(this);
 }
 /**
  * Call the closure once for each document in this corpus, loading and unloading documents as
  * appropriate in the case of a persistent corpus, and adding the return values of each call to
  * the given collection.
  *
  * @param self the corpus to traverse
  * @param closure the closure to call
  * @return a list of the return values from each closure call.
  */
 public static <T> Collection<T> collect(Corpus self, Collection<T> coll, Closure<T> closure) {
   for (int i = 0; i < self.size(); i++) {
     boolean docWasLoaded = self.isDocumentLoaded(i);
     Document doc = self.get(i);
     coll.add(closure.call(doc));
     if (!docWasLoaded) {
       self.unloadDocument(doc);
       Factory.deleteResource(doc);
     }
   }
   return coll;
 }
 /**
  * Call the closure once for each document in this corpus, loading and unloading documents as
  * appropriate in the case of a persistent corpus.
  *
  * @param self the corpus to traverse
  * @param closure the closure to call
  * @return the corpus.
  */
 public static <T> Object each(Corpus self, Closure<T> closure) {
   for (int i = 0; i < self.size(); i++) {
     boolean docWasLoaded = self.isDocumentLoaded(i);
     Document doc = self.get(i);
     closure.call(doc);
     if (!docWasLoaded) {
       self.unloadDocument(doc);
       Factory.deleteResource(doc);
     }
   }
   return self;
 }
Beispiel #9
0
  /** Called by a datastore when a resource has been deleted */
  @Override
  public void resourceDeleted(DatastoreEvent evt) {
    DataStore ds = (DataStore) evt.getSource();
    // 1. check whether this datastore fired the event. If not, return.
    if (!ds.equals(this.dataStore)) return;

    Object docID = evt.getResourceID();
    if (docID == null) return;

    if (DEBUG) Out.prln("Resource deleted called for: " + docID);
    // first check if it is this corpus that's been deleted, it must be
    // unloaded immediately
    if (docID.equals(this.getLRPersistenceId())) {
      Factory.deleteResource(this);
      return;
    } // if

    boolean isDirty = false;
    // the problem here is that I only have the doc persistent ID
    // and nothing else, so I need to determine the index of the doc
    // first
    for (int i = 0; i < docDataList.size(); i++) {
      DocumentData docData = docDataList.get(i);
      // we've found the correct document
      // don't break the loop, because it might appear more than once
      if (docID.equals(docData.getPersistentID())) {
        if (evt.getResource() == null) {
          // instead of calling remove() which tries to load the
          // document
          // remove it from the documents and docDataList
          documentRemoved(docDataList.get(i).persistentID.toString());
          docDataList.remove(i);
          documents.remove(i);
          isDirty = true;
          i--;
          continue;
        }

        remove(i);
        isDirty = true;
      } // if
    } // for loop through the doc data

    if (isDirty)
      try {
        this.dataStore.sync(this);
      } catch (PersistenceException ex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + ex.getMessage());
      } catch (SecurityException sex) {
        throw new GateRuntimeException("SerialCorpusImpl: " + sex.getMessage());
      }
  } // resourceDeleted
Beispiel #10
0
 public void actionPerformed(ActionEvent e) {
   int[] rowsTable = docTable.getSelectedRows();
   int[] rowsCorpus = new int[rowsTable.length];
   for (int i = 0; i < rowsTable.length; i++)
     rowsCorpus[i] = docTable.rowViewToModel(rowsTable[i]);
   Arrays.sort(rowsCorpus);
   // starting from the largest one, move each element down
   for (int i = rowsCorpus.length - 1; i >= 0; i--) {
     if (rowsCorpus[i] < corpus.size() - 1) {
       // swap the doc with the one before
       // serial corpus does not load the document on remove, so we need
       // to load the document explicitly
       boolean wasLoaded = corpus.isDocumentLoaded(rowsCorpus[i]);
       Document doc = (Document) corpus.get(rowsCorpus[i]);
       corpus.remove(rowsCorpus[i]);
       rowsCorpus[i]++;
       corpus.add(rowsCorpus[i], doc);
       if (!wasLoaded) {
         corpus.unloadDocument(doc);
         Factory.deleteResource(doc);
       }
     }
   }
   // restore selection
   // the remove / add events will cause the table to be updated
   // we need to only restore the selection after that happened
   final int[] selectedRowsCorpus = new int[rowsCorpus.length];
   System.arraycopy(rowsCorpus, 0, selectedRowsCorpus, 0, rowsCorpus.length);
   SwingUtilities.invokeLater(
       new Runnable() {
         public void run() {
           docTable.clearSelection();
           for (int i = 0; i < selectedRowsCorpus.length; i++) {
             int rowTable = docTable.rowModelToView(selectedRowsCorpus[i]);
             docTable.getSelectionModel().addSelectionInterval(rowTable, rowTable);
           }
         }
       });
 }
Beispiel #11
0
  /**
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {
    /* Parse command line arguments */
    Getopt g = new Getopt("gateExtractor", args, "i:r:te");
    g.setOpterr(false);

    String inputPath = "";
    String outputPath = "";

    boolean train = false;
    boolean eval = false;
    boolean run = false;

    int c;
    String arg;
    while ((c = g.getopt()) != -1) {
      switch (c) {
        case 'i':
          arg = g.getOptarg();
          if (arg == null || arg.isEmpty()) {
            usage("Please provide an input path");
          }
          inputPath = arg;
          break;
        case 'r':
          run = true;
          arg = g.getOptarg();
          if (arg == null || arg.isEmpty()) {
            usage("Please provide an output path");
          }
          outputPath = arg;
          break;
        case 't':
          train = true;
          break;
        case 'e':
          eval = true;
          break;
        case '?':
        default:
          usage(null);
      }
    }

    if (args.length == 0 || (!run && !train && !eval)) {
      usage("Nothing to do.");
    }

    if (inputPath == null || inputPath.isEmpty()) {
      usage("Please provide an input path");
    }

    if (run && (outputPath == null || outputPath.isEmpty())) {
      usage("Please provide an output directory!");
    }

    if (train && eval) {
      usage("Only one mode allowed at a time");
    }

    if (train && run) {
      usage("Only one mode allowed at a time");
    }

    if (eval && run) {
      usage("Only one mode allowed at a time");
    }

    /* Initialize GATE */
    String location =
        new File(Main.class.getProtectionDomain().getCodeSource().getLocation().getPath())
            .getParent();
    String resourcesFolder = location + "/resources";
    Gate.setGateHome(new File(resourcesFolder));

    /* Create ml-config.xml with threads */

    createConfig(resourcesFolder + File.separator);
    Gate.init();

    /* Load Corpus */
    log.info("Loading Corpus ... ");
    Corpus corpus = Factory.newCorpus("Training Corpus");
    File directory = new File(inputPath);
    URL url = directory.toURI().toURL();
    corpus.populate(url, null, null, true);
    log.info("Done loading Corpus!");

    Pipeline pipeline = null;

    /* Do Tagging */
    pipeline = new Tagger();
    pipeline.run(corpus, resourcesFolder);

    /* Train */
    if (train) {
      pipeline = new Trainer();
      pipeline.run(corpus, resourcesFolder);
    }

    /* Apply learned rules */
    if (run) {
      pipeline = new Extractor();
      pipeline.run(corpus, resourcesFolder);

      ExecutorService executorService = Executors.newFixedThreadPool(20);
      for (int i = 0; i < corpus.size(); i++) {
        executorService.execute(new OutputGenerator(outputPath, corpus.get(i)));
      }
      executorService.shutdown();
      executorService.awaitTermination(Long.MAX_VALUE, TimeUnit.DAYS);
    }

    /* Evaluate results */
    if (eval) {
      pipeline = new Evaluator();
      pipeline.run(corpus, resourcesFolder);
    }

    /* Clean up */
    Factory.deleteResource(corpus);
    outputFile_mlConfigThreads.delete();
  }
 @Override
 public void cleanup() {
   Factory.deleteResource(japeTransducer);
 }
  /**
   * The main entry point. First we parse the command line options (see usage() method for details),
   * then we take all remaining command line parameters to be file names to process. Each file is
   * loaded, processed using the application and the results written to the output file
   * (inputFile.out.xml).
   */
  public static void main(String[] args) throws Exception {
    parseCommandLine(args);

    // initialise GATE - this must be done before calling any GATE APIs
    Gate.init();

    // load the saved application
    CorpusController application =
        (CorpusController) PersistenceManager.loadObjectFromFile(gappFile);

    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.  The string parameter to newCorpus() is simply the
    // GATE-internal name to use for the corpus.  It has no particular
    // significance.

    ArrayList<String> files = getFilesFromDir(inputDir);
    gate.Corpus corpus = createCorpus(files);
    // Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    application.setCorpus(corpus);

    System.out.println("Processing " + files.size() + " files");

    // process the files one by one
    for (int i = 0; i < files.size(); i++) {

      // load the document (using the specified encoding if one was given)
      File docFile = new File(files.get(i));
      System.out.print("Processing document " + docFile + " (" + i + ") ...");
      Document doc = Factory.newDocument(docFile.toURL(), encoding);

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      application.execute();

      // remove the document from the corpus again
      corpus.clear();

      String docXMLString = null;
      // if we want to just write out specific annotation types, we must
      // extract the annotations into a Set
      if (annotTypesToWrite != null) {
        // Create a temporary Set to hold the annotations we wish to write out
        Set annotationsToWrite = new HashSet();

        // we only extract annotations from the default (unnamed) AnnotationSet
        // in this example
        AnnotationSet defaultAnnots = doc.getAnnotations("Output");
        Iterator annotTypesIt = annotTypesToWrite.iterator();
        while (annotTypesIt.hasNext()) {
          // extract all the annotations of each requested type and add them to
          // the temporary set
          AnnotationSet annotsOfThisType = defaultAnnots.get((String) annotTypesIt.next());
          if (annotsOfThisType != null) {
            annotationsToWrite.addAll(annotsOfThisType);
          }
        }

        // create the XML string using these annotations
        docXMLString = doc.toXml(annotationsToWrite, true);
      }
      // otherwise, just write out the whole document as GateXML
      else {
        docXMLString = doc.toXml();
      }

      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);

      // output the XML to <inputFile>.out.xml
      System.out.println("Writing file " + docFile.getName());
      String outputFileName = docFile.getName() + ".out.xml";
      // File outputFile = new File(docFile.getParentFile(), outputFileName);
      File outputFile = new File(new File(outputDir).getAbsolutePath(), outputFileName);

      // Write output files using the same encoding as the original
      FileOutputStream fos = new FileOutputStream(outputFile);
      BufferedOutputStream bos = new BufferedOutputStream(fos);
      OutputStreamWriter out;
      if (encoding == null) {
        out = new OutputStreamWriter(bos);
      } else {
        out = new OutputStreamWriter(bos, encoding);
      }

      out.write(docXMLString);

      out.close();
      System.out.println("done");
    } // for each file

    System.out.println("All done");
  } // void main(String[] args)
Beispiel #14
0
 public void close() {
   if (GATEapplication != null) Factory.deleteResource(GATEapplication);
 }
Beispiel #15
0
  // Process an input document with GATE and a Reporter
  public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

    boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      AnnotationSet annots = null;
      if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations();
      else annots = gatedocument.getAnnotations(filters.getAnnotationSetName());

      // enrich the input doc with the annotations from
      // the GATE application
      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      List<com.digitalpebble.behemoth.Annotation> beheannotations =
          convertGATEAnnotationsToBehemoth(annots, inputDoc);

      // sort the annotations before adding them?
      Collections.sort(beheannotations);

      // clear the existing behemoth annotations
      if (clearBehemothAnnotations) {
        inputDoc.getAnnotations().clear();
      }

      inputDoc.getAnnotations().addAll(beheannotations);

      // add counters about num of annotations added
      if (reporter != null)
        for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
          reporter.incrCounter("GATE", annot.getType(), 1);
        }

      // Add the document features from GATE to Behemoth
      Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
      MapWritable beheMD = inputDoc.getMetadata(true);
      if (docFeatFilter.size() > 0) {
        for (String docFeatName : docFeatFilter) {
          Object featValue = gatedocument.getFeatures().get(docFeatName);
          if (featValue != null) {
            beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
          }
        }
      }

      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    // currently returns only the input document
    return new BehemothDocument[] {inputDoc};
  }
  private RetObj ProcessRecords() throws Exception {
    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    this.application.setCorpus(corpus);

    // object for returned data
    List<String> processedlines = new ArrayList<String>();
    List<String> processedText = new ArrayList<String>();

    for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
      /*if( record_num % Math.ceil(((double) this.recs.size())/10.0) == 0)
           System.out.println("Thread " + this.threadID + ": "+ ((int) ((double)record_num)/((double) this.recs.size())*100.0 ) +"% complete.");
      */

      // first, split title from body and get embedded age in title..
      String title_age = "-1";
      String sep = "..THIS IS MY SEPARATION STRING..";
      String title = "";
      String body = this.recs.get(record_num);
      Boolean trimmed = false;
      int age_end = body.indexOf(",>           ");
      if (age_end >= 0 && age_end < body.length()) {
        int age_start = body.lastIndexOf("-", age_end);
        if (age_start >= 0 && age_start < age_end) {
          title_age = body.substring(age_start + 1, age_end).trim();
          if (!isInteger(title_age)) title_age = "-1";
          else {
            title = body.substring(0, age_start);
            body = body.substring(age_end + 2, body.length());
            body = title + sep + body;
            trimmed = true;
          }
        }
        if (!trimmed) {
          title = body.substring(0, age_end);
          body = body.substring(age_end + 2, body.length());
          body = title + sep + body;
          trimmed = true;
        }
      }
      // --------------------

      org.jsoup.nodes.Document htmldoc =
          Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ","));
      Elements links = htmldoc.select("a[href]");
      Elements media = htmldoc.select("[src]");
      Elements imports = htmldoc.select("link[href]");

      processedText.add(htmldoc.text().replace(sep, " "));
      Document doc = Factory.newDocument(htmldoc.text());

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      this.application.execute();

      // remove the document from the corpus again
      corpus.clear();

      // extract annotations
      String line = "";
      AnnotationSet Annots = doc.getAnnotations("");

      Integer FirstPersonCount = 0, ThirdPersonCount = 0;
      AnnotationSet FirstPerson = Annots.get("FirstPerson");
      if (FirstPerson != null) FirstPersonCount = FirstPerson.size();
      AnnotationSet ThirdPerson = Annots.get("ThirdPerson");
      if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size();
      line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ",";

      AnnotationSet Names = Annots.get("Name");
      if (Names == null || Names.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Names.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("name");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Age = Annots.get("Age");
      if (Age == null || Age.size() < 1) line += title_age + ",";
      else {
        Iterator<Annotation> Iter = Age.inDocumentOrder().iterator();
        line += title_age + ";";
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("age");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Cost = Annots.get("Cost");
      if (Cost == null || Cost.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_type");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet height = Annots.get("height");
      if (height == null || height.size() < 1) line += ",,";
      else {
        String ft = "";
        String inch = "";
        Iterator<Annotation> Iter = height.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("feet");
          if (Feat != null) ft += Feat.toString();
          else ft += "none";
          Feat = Ann.getFeatures().get("inches");
          if (Feat != null) inch += Feat.toString();
          else inch += "none";
          if (Iter.hasNext()) {
            ft += ";";
            inch += ";";
          }
        }
        line += ft + "," + inch + ",";
      }

      AnnotationSet weight = Annots.get("weight");
      if (weight == null || weight.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = weight.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("pounds");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet measurement = Annots.get("measurement");
      if (measurement == null || measurement.size() < 1) line += ",,,,";
      else {
        String cup = "";
        String chest = "";
        String waist = "";
        String hip = "";
        Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("cup");
          if (Feat != null) cup += Feat.toString();
          else cup += "none";
          Feat = Ann.getFeatures().get("chest");
          if (Feat != null) chest += Feat.toString();
          else chest += "none";
          Feat = Ann.getFeatures().get("waist");
          if (Feat != null) waist += Feat.toString();
          else waist += "none";
          Feat = Ann.getFeatures().get("hip");
          if (Feat != null) hip += Feat.toString();
          else hip += "none";
          if (Iter.hasNext()) {
            cup += ";";
            chest += ";";
            waist += ";";
            hip += ";";
          }
        }
        line += cup + "," + chest + "," + waist + "," + hip + ",";
      }

      AnnotationSet Ethnicity = Annots.get("Ethnicity");
      if (Ethnicity == null || Ethnicity.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet SkinColor = Annots.get("SkinColor");
      if (SkinColor == null || SkinColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet EyeColor = Annots.get("EyeColor");
      if (EyeColor == null || EyeColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet HairColor = Annots.get("HairColor");
      if (HairColor == null || HairColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Restriction = Annots.get("Restriction");
      if (Restriction == null || Restriction.size() < 1) line += ",,,";
      else {
        String type = "";
        String ethnicity = "";
        String age = "";
        Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("type");
          if (Feat != null) type += Feat.toString();
          else type += "none";
          Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else ethnicity += "none";
          Feat = Ann.getFeatures().get("age");
          if (Feat != null) age += Feat.toString();
          else age += "none";
          if (Iter.hasNext()) {
            type += ";";
            ethnicity += ";";
            age += ";";
          }
        }
        line += type + "," + ethnicity + "," + age + ",";
      }

      AnnotationSet Phone = Annots.get("PhoneNumber");
      if (Phone == null || Phone.size() < 1) line += ",,,";
      else {
        String value = "";
        String state = "";
        String city = "";
        Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) value += Feat.toString();
          else value += "none";
          Feat = Ann.getFeatures().get("state");
          if (Feat != null) state += Feat.toString();
          else state += "none";
          Feat = Ann.getFeatures().get("area");
          if (Feat != null)
            city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else city += "none";
          if (Iter.hasNext()) {
            value += ";";
            state += ";";
            city += ";";
          }
        }
        line += value + "," + state + "," + city + ",";
      }

      String Emails = "";
      AnnotationSet Email = Annots.get("Email");
      if (Email == null || Email.size() < 1) Emails = "";
      else {
        Iterator<Annotation> Iter = Email.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("email");
          if (Feat != null)
            Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Emails +=
                href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";"))
        Emails = Emails.substring(0, Emails.length() - 1);
      line += Emails + ",";

      String Urls = "";
      AnnotationSet Url = Annots.get("Url");
      if (Url == null || Url.size() < 1) Urls = "";
      else {
        Iterator<Annotation> Iter = Url.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("url");
          if (Feat != null)
            Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (imports != null) {
        for (Element l : imports) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";"))
        Urls = Urls.substring(0, Urls.length() - 1);
      line += Urls + ",";

      String Medias = "";
      if (media != null) {
        for (Element l : media) {
          String src = l.attr("abs:src");
          if (src == null) continue;
          Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Medias.length() > 0 && Medias.substring(Medias.length() - 1, Medias.length()).equals(";"))
        Medias = Medias.substring(0, Medias.length() - 1);
      line += Medias;

      processedlines.add(line);
      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);
    }
    Factory.deleteResource(corpus);

    RetObj out = new RetObj(processedlines, processedText);
    return out;
  }
Beispiel #17
0
  public static void main(String[] args) throws Exception {

    GateUtils.initGate();

    ie =
        new SpcInteractionExport(
            "SpcInteractionExport/SpcInteractionExport_" + ProjectSetup.makeTimeStamp() + ".csv");

    /**/

    initController();

    /**/

    CsvReader in =
        new CsvReader("C:/Users/dedek/Desktop/DATLOWE/LP_SPC.csv", ';', Charset.forName("cp1250"));
    in.readHeaders();

    StopRequestDetector srd = new StopRequestDetector();
    srd.startDetector();

    int num = 0;

    while (!srd.stop_requested) {
      if (!in.readRecord()) break;

      String spcCode = in.get(0);
      ie.setSpcCode(spcCode);

      String doc = in.get(3);
      String pdf = in.get(4);

      String fileStr = pdf;
      File file = null;

      if (!fileStr.isEmpty()) {
        file = new File("C:/Users/dedek/Desktop/DATLOWE/SPC_all/" + fileStr);
        if (!file.exists()) fileStr = "";
      }

      if (fileStr.isEmpty()) {
        fileStr = doc;
      }

      if (!fileStr.isEmpty()) {
        file = new File("C:/Users/dedek/Desktop/DATLOWE/SPC_all/" + fileStr);
        if (!file.exists()) fileStr = "";
      }

      System.err.format(
          "%4d %s record: %s %s %s\n",
          ++num, ProjectSetup.makeTimeStamp(), spcCode, in.get(1), fileStr);
      if (fileStr.isEmpty()) {
        System.err.format("WARNING ommiting record: %s %s\n", in.get(0), in.get(1));
        continue;
      }

      Document gateDoc = Factory.newDocument(file.toURI().toURL(), "utf8");
      corpus.add(gateDoc);

      try {
        controller.execute();
      } catch (ExecutionException e) {
        System.err.println("----------------------- EXECUTION INTERUPTED -------------------");
        e.printStackTrace();
        initController();
        System.err.println("----------------------- EXECUTION RESTARTED -------------------");
      }

      corpus.clear();
      // GateUtils.saveGateDocumentToXML(gateDoc,
      // "C:/Users/dedek/Desktop/DATLOWE/SPC_ananlyzed/"+spcCode+".xml");
      Factory.deleteResource(gateDoc);
    }

    in.close();
    ie.close();

    GateUtils.deleteAllPublicGateResources();
  }