Esempio n. 1
0
  /**
   * Two AnnotationSet are equal if their name, the documents of which belong to the AnnotationSets
   * and annotations from the sets are the same
   */
  public static boolean annotationSetsEqual(AnnotationSet as1, AnnotationSet as2) {
    if (as1 == null ^ as2 == null) return false;
    if (as1 == null) return true;
    // Sets equality
    if (as1.size() != as2.size()) return false;
    try {
      if (!as1.containsAll(as2)) return false;
    } catch (ClassCastException unused) {
      return false;
    } catch (NullPointerException unused) {
      return false;
    }

    // removed to prevent infinite looping in testDocumentsEqual()
    //    // verify the documents which they belong to
    //    if (! check (as1.getDocument(), as2.getDocument())) return false;

    // verify the name of the AnnotationSets
    if (!check(as1.getName(), as2.getName())) return false;
    return true;
  } // equals
  private RetObj ProcessRecords() throws Exception {
    // Create a Corpus to use.  We recycle the same Corpus object for each
    // iteration.
    Corpus corpus = Factory.newCorpus("BatchProcessApp Corpus");
    this.application.setCorpus(corpus);

    // object for returned data
    List<String> processedlines = new ArrayList<String>();
    List<String> processedText = new ArrayList<String>();

    for (int record_num = 0; record_num < this.recs.size(); ++record_num) {
      /*if( record_num % Math.ceil(((double) this.recs.size())/10.0) == 0)
           System.out.println("Thread " + this.threadID + ": "+ ((int) ((double)record_num)/((double) this.recs.size())*100.0 ) +"% complete.");
      */

      // first, split title from body and get embedded age in title..
      String title_age = "-1";
      String sep = "..THIS IS MY SEPARATION STRING..";
      String title = "";
      String body = this.recs.get(record_num);
      Boolean trimmed = false;
      int age_end = body.indexOf(",>           ");
      if (age_end >= 0 && age_end < body.length()) {
        int age_start = body.lastIndexOf("-", age_end);
        if (age_start >= 0 && age_start < age_end) {
          title_age = body.substring(age_start + 1, age_end).trim();
          if (!isInteger(title_age)) title_age = "-1";
          else {
            title = body.substring(0, age_start);
            body = body.substring(age_end + 2, body.length());
            body = title + sep + body;
            trimmed = true;
          }
        }
        if (!trimmed) {
          title = body.substring(0, age_end);
          body = body.substring(age_end + 2, body.length());
          body = title + sep + body;
          trimmed = true;
        }
      }
      // --------------------

      org.jsoup.nodes.Document htmldoc =
          Jsoup.parseBodyFragment(body.replaceAll("COMMA_GOES_HERE", ","));
      Elements links = htmldoc.select("a[href]");
      Elements media = htmldoc.select("[src]");
      Elements imports = htmldoc.select("link[href]");

      processedText.add(htmldoc.text().replace(sep, " "));
      Document doc = Factory.newDocument(htmldoc.text());

      // put the document in the corpus
      corpus.add(doc);

      // run the application
      this.application.execute();

      // remove the document from the corpus again
      corpus.clear();

      // extract annotations
      String line = "";
      AnnotationSet Annots = doc.getAnnotations("");

      Integer FirstPersonCount = 0, ThirdPersonCount = 0;
      AnnotationSet FirstPerson = Annots.get("FirstPerson");
      if (FirstPerson != null) FirstPersonCount = FirstPerson.size();
      AnnotationSet ThirdPerson = Annots.get("ThirdPerson");
      if (ThirdPerson != null) ThirdPersonCount = ThirdPerson.size();
      line += FirstPersonCount.toString() + "," + ThirdPersonCount.toString() + ",";

      AnnotationSet Names = Annots.get("Name");
      if (Names == null || Names.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Names.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("name");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Age = Annots.get("Age");
      if (Age == null || Age.size() < 1) line += title_age + ",";
      else {
        Iterator<Annotation> Iter = Age.inDocumentOrder().iterator();
        line += title_age + ";";
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("age");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Cost = Annots.get("Cost");
      if (Cost == null || Cost.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Cost.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_value");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          line += "/";
          Feat = Ann.getFeatures().get("target_type");
          if (Feat != null) line += Feat.toString();
          else line += "none";
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet height = Annots.get("height");
      if (height == null || height.size() < 1) line += ",,";
      else {
        String ft = "";
        String inch = "";
        Iterator<Annotation> Iter = height.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("feet");
          if (Feat != null) ft += Feat.toString();
          else ft += "none";
          Feat = Ann.getFeatures().get("inches");
          if (Feat != null) inch += Feat.toString();
          else inch += "none";
          if (Iter.hasNext()) {
            ft += ";";
            inch += ";";
          }
        }
        line += ft + "," + inch + ",";
      }

      AnnotationSet weight = Annots.get("weight");
      if (weight == null || weight.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = weight.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("pounds");
          if (Feat != null) line += Feat.toString();
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet measurement = Annots.get("measurement");
      if (measurement == null || measurement.size() < 1) line += ",,,,";
      else {
        String cup = "";
        String chest = "";
        String waist = "";
        String hip = "";
        Iterator<Annotation> Iter = measurement.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("cup");
          if (Feat != null) cup += Feat.toString();
          else cup += "none";
          Feat = Ann.getFeatures().get("chest");
          if (Feat != null) chest += Feat.toString();
          else chest += "none";
          Feat = Ann.getFeatures().get("waist");
          if (Feat != null) waist += Feat.toString();
          else waist += "none";
          Feat = Ann.getFeatures().get("hip");
          if (Feat != null) hip += Feat.toString();
          else hip += "none";
          if (Iter.hasNext()) {
            cup += ";";
            chest += ";";
            waist += ";";
            hip += ";";
          }
        }
        line += cup + "," + chest + "," + waist + "," + hip + ",";
      }

      AnnotationSet Ethnicity = Annots.get("Ethnicity");
      if (Ethnicity == null || Ethnicity.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = Ethnicity.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet SkinColor = Annots.get("SkinColor");
      if (SkinColor == null || SkinColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = SkinColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet EyeColor = Annots.get("EyeColor");
      if (EyeColor == null || EyeColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = EyeColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet HairColor = Annots.get("HairColor");
      if (HairColor == null || HairColor.size() < 1) line += ",";
      else {
        Iterator<Annotation> Iter = HairColor.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("color");
          if (Feat != null)
            line += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          if (Iter.hasNext()) line += ";";
        }
        line += ",";
      }

      AnnotationSet Restriction = Annots.get("Restriction");
      if (Restriction == null || Restriction.size() < 1) line += ",,,";
      else {
        String type = "";
        String ethnicity = "";
        String age = "";
        Iterator<Annotation> Iter = Restriction.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("type");
          if (Feat != null) type += Feat.toString();
          else type += "none";
          Feat = Ann.getFeatures().get("ethnicity");
          if (Feat != null)
            ethnicity += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else ethnicity += "none";
          Feat = Ann.getFeatures().get("age");
          if (Feat != null) age += Feat.toString();
          else age += "none";
          if (Iter.hasNext()) {
            type += ";";
            ethnicity += ";";
            age += ";";
          }
        }
        line += type + "," + ethnicity + "," + age + ",";
      }

      AnnotationSet Phone = Annots.get("PhoneNumber");
      if (Phone == null || Phone.size() < 1) line += ",,,";
      else {
        String value = "";
        String state = "";
        String city = "";
        Iterator<Annotation> Iter = Phone.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("value");
          if (Feat != null) value += Feat.toString();
          else value += "none";
          Feat = Ann.getFeatures().get("state");
          if (Feat != null) state += Feat.toString();
          else state += "none";
          Feat = Ann.getFeatures().get("area");
          if (Feat != null)
            city += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ");
          else city += "none";
          if (Iter.hasNext()) {
            value += ";";
            state += ";";
            city += ";";
          }
        }
        line += value + "," + state + "," + city + ",";
      }

      String Emails = "";
      AnnotationSet Email = Annots.get("Email");
      if (Email == null || Email.size() < 1) Emails = "";
      else {
        Iterator<Annotation> Iter = Email.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("email");
          if (Feat != null)
            Emails += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() > 7 && href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Emails +=
                href.substring(7, href.length()).replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (Emails.length() > 0 && Emails.substring(Emails.length() - 1, Emails.length()).equals(";"))
        Emails = Emails.substring(0, Emails.length() - 1);
      line += Emails + ",";

      String Urls = "";
      AnnotationSet Url = Annots.get("Url");
      if (Url == null || Url.size() < 1) Urls = "";
      else {
        Iterator<Annotation> Iter = Url.inDocumentOrder().iterator();
        while (Iter.hasNext()) {
          Annotation Ann = Iter.next();
          Object Feat = Ann.getFeatures().get("url");
          if (Feat != null)
            Urls += Feat.toString().toLowerCase().replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (links != null) {
        for (Element l : links) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          if (href.length() <= 7 || !href.substring(0, 7).toLowerCase().equals("mailto:")) {
            Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
          }
        }
      }
      if (imports != null) {
        for (Element l : imports) {
          String href = l.attr("abs:href");
          if (href == null) continue;
          Urls += href.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Urls.length() > 0 && Urls.substring(Urls.length() - 1, Urls.length()).equals(";"))
        Urls = Urls.substring(0, Urls.length() - 1);
      line += Urls + ",";

      String Medias = "";
      if (media != null) {
        for (Element l : media) {
          String src = l.attr("abs:src");
          if (src == null) continue;
          Medias += src.replaceAll(",", " ").replaceAll(";", " ") + ";";
        }
      }
      if (Medias.length() > 0 && Medias.substring(Medias.length() - 1, Medias.length()).equals(";"))
        Medias = Medias.substring(0, Medias.length() - 1);
      line += Medias;

      processedlines.add(line);
      // Release the document, as it is no longer needed
      Factory.deleteResource(doc);
    }
    Factory.deleteResource(corpus);

    RetObj out = new RetObj(processedlines, processedText);
    return out;
  }
Esempio n. 3
0
  @Override
  public void execute() throws ExecutionException {
    AnnotationSet inputAS = document.getAnnotations(inputASName);
    AnnotationSet outputAS = document.getAnnotations(outputASName);
    AnnotationSet tagAS = document.getAnnotations(tagASName);
    AnnotationSet annotsToTransfer = null;

    boolean newID = copyAnnotations && inputAS.equals(outputAS);

    mappings.clear();

    // TODO clean this up so we don't have to repeat ourselves
    if (configURL != null) {

      BufferedReader in = null;
      try {
        in = new BomStrippingInputStreamReader(configURL.openStream());

        String line = in.readLine();
        while (line != null) {
          if (!line.trim().equals("")) {
            String[] data = line.split("=", 2);
            String oldName = data[0].trim();
            String newName = data.length == 2 ? data[1].trim() : null;
            mappings.put(oldName, new Mapping(oldName, newName));
          }
          line = in.readLine();
        }
      } catch (IOException ioe) {
        ioe.printStackTrace();
      } finally {
        IOUtils.closeQuietly(in);
      }
    } else if (annotationTypes != null) {
      for (String type : annotationTypes) {
        String[] data = type.split("=", 2);
        String oldName = data[0].trim();
        String newName = data.length == 2 ? data[1].trim() : null;

        mappings.put(oldName, new Mapping(oldName, newName));
      }
    }
    // else
    // throw new
    // ExecutionException("The annotation list and URL cannot both be null");

    if (mappings.size() > 0) {
      annotsToTransfer = inputAS.get(mappings.keySet());
    } else {
      // transfer everything
      annotsToTransfer = inputAS.get();
    }
    // in case of no one annotation from some of annotationTypes
    if (annotsToTransfer == null || annotsToTransfer.size() == 0) return;
    // check if we have a BODY annotation
    // if not, just copy all
    if (textTagName == null || textTagName.equals("")) {
      // remove from input set unless we copy only
      if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
      transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);

      return;
    }
    // get the BODY annotation
    bodyAnnotations = tagAS.get(textTagName);
    if (bodyAnnotations == null || bodyAnnotations.isEmpty()) {
      // outputAS.addAll(inputAS);
      if (transferAllUnlessFound) {
        // remove from input set unless we copy only
        if (!copyAnnotations) inputAS.removeAll(annotsToTransfer);
        transferAnnotations(new ArrayList<Annotation>(annotsToTransfer), outputAS, newID);
      }
      return;
    }
    List<Annotation> annots2Move = new ArrayList<Annotation>();
    Iterator<Annotation> bodyIter = bodyAnnotations.iterator();
    while (bodyIter.hasNext()) {
      Annotation bodyAnn = bodyIter.next();
      Long start = bodyAnn.getStartNode().getOffset();
      Long end = bodyAnn.getEndNode().getOffset();
      // get all annotations we want transferred
      AnnotationSet annots2Copy = annotsToTransfer.getContained(start, end);
      // copy them to the new set and delete them from the old one
      annots2Move.addAll(annots2Copy);
    }
    if (!copyAnnotations) inputAS.removeAll(annots2Move);
    transferAnnotations(annots2Move, outputAS, newID);
  }