コード例 #1
0
ファイル: HEntryExtractor.java プロジェクト: muse-tika/any23
 /**
  * Emits geo properties for every nested location found in {@code doc}.
  *
  * <p>Location nodes are looked up by the class name built from the property prefix, the
  * {@code "location"} entry field, the space separator, the class prefix and {@code "geo"}. For
  * each match a fresh blank node is created and typed with {@code vEntry.location}; every field in
  * {@code geoFields} found inside the match is then added to that blank node, taking the value
  * from the source element's {@code title} attribute when present and from the field text
  * otherwise.
  *
  * <p>NOTE(review): {@code entry} is never referenced in this method, so the location blank node
  * is not attached to the entry here. Confirm whether that is intentional.
  *
  * @param doc the document fragment to search for location nodes
  * @param entry the entry resource being extracted (currently unused)
  * @throws ExtractionException declared by the signature; propagated from the helpers called here
  */
 private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
   final String locationClassName =
       Microformats2Prefixes.PROPERTY_PREFIX
           + entryFields[11]
           + Microformats2Prefixes.SPACE_SEPARATOR
           + Microformats2Prefixes.CLASS_PREFIX
           + "geo";
   // An empty result simply skips the loop, so no separate emptiness guard is needed.
   for (Node locationNode : doc.findAllByClassName(locationClassName)) {
     final BNode location = valueFactory.createBNode();
     addURIProperty(location, RDF.TYPE, vEntry.location);
     final HTMLDocument geoFragment = new HTMLDocument(locationNode);
     for (String geoField : geoFields) {
       final String geoClassName = Microformats2Prefixes.PROPERTY_PREFIX + geoField;
       for (HTMLDocument.TextField coordinate : geoFragment.getPluralTextField(geoClassName)) {
         final Node titleAttribute = coordinate.source().getAttributes().getNamedItem("title");
         // Prefer the title attribute; fall back to the element's text value.
         conditionallyAddStringProperty(
             coordinate.source(),
             location,
             vVCARD.getProperty(geoField),
             titleAttribute == null ? coordinate.value() : titleAttribute.getNodeValue());
       }
     }
   }
 }
コード例 #2
0
ファイル: HEntryExtractor.java プロジェクト: muse-tika/any23
/**
 * Extractor for the <a href="http://microformats.org/wiki/h-entry">h-entry</a> microformat.
 *
 * @author Nisala Nirmana
 */
public class HEntryExtractor extends EntityBasedMicroformatExtractor {

  private static final HEntry vEntry = HEntry.getInstance();
  private static final VCard vVCARD = VCard.getInstance();

  private static final String[] entryFields = {
    "name",
    "summary",
    "content",
    "published",
    "updated",
    "category",
    "url",
    "uid",
    "syndication",
    "in-reply-to",
    "author",
    "location",
  };

  private static final String[] geoFields = {"latitude", "longitude", "altitude"};

  @Override
  public ExtractorDescription getDescription() {
    return HEntryExtractorFactory.getDescriptionInstance();
  }

  @Override
  protected String getBaseClassName() {
    return Microformats2Prefixes.CLASS_PREFIX + "entry";
  }

  @Override
  protected void resetExtractor() {
    // Empty.
  }

  @Override
  protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
    final BNode entry = getBlankNodeFor(node);
    conditionallyAddResourceProperty(entry, RDF.TYPE, vEntry.Entry);
    final HTMLDocument fragment = new HTMLDocument(node);
    addName(fragment, entry);
    addSummary(fragment, entry);
    addContent(fragment, entry);
    addPublished(fragment, entry);
    addUpdated(fragment, entry);
    addCategories(fragment, entry);
    addURLs(fragment, entry);
    addUID(fragment, entry);
    addSyndications(fragment, entry);
    addInReplyTo(fragment, entry);
    addLocations(fragment, entry);
    addAuthors(fragment, entry);
    return true;
  }

  private void addAuthors(HTMLDocument doc, Resource entry) throws ExtractionException {
    List<Node> nodes =
        doc.findAllByClassName(
            Microformats2Prefixes.PROPERTY_PREFIX
                + entryFields[10]
                + Microformats2Prefixes.SPACE_SEPARATOR
                + Microformats2Prefixes.CLASS_PREFIX
                + "card");
    if (nodes.isEmpty()) return;
    HCardExtractorFactory factory = new HCardExtractorFactory();
    HCardExtractor extractor = factory.createExtractor();
    for (Node node : nodes) {
      BNode author = valueFactory.createBNode();
      addURIProperty(author, RDF.TYPE, vEntry.author);
      extractor.extractEntityAsEmbeddedProperty(
          new HTMLDocument(node), author, getCurrentExtractionResult());
    }
  }

  private void mapFieldWithProperty(
      HTMLDocument fragment, BNode entry, String fieldClass, URI property) {
    HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
    conditionallyAddStringProperty(title.source(), entry, property, title.value());
  }

  private void addName(HTMLDocument fragment, BNode entry) {
    mapFieldWithProperty(
        fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[0], vEntry.name);
  }

  private void addSummary(HTMLDocument fragment, BNode entry) {
    mapFieldWithProperty(
        fragment, entry, Microformats2Prefixes.PROPERTY_PREFIX + entryFields[1], vEntry.summary);
  }

  private void addContent(HTMLDocument fragment, BNode entry) {
    mapFieldWithProperty(
        fragment,
        entry,
        Microformats2Prefixes.EMBEDDED_PROPERTY_PREFIX + entryFields[2],
        vEntry.content);
  }

  private void addPublished(HTMLDocument fragment, BNode entry) {
    final HTMLDocument.TextField[] durations =
        fragment.getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[3]);
    for (HTMLDocument.TextField duration : durations) {
      Node attribute = duration.source().getAttributes().getNamedItem("datetime");
      if (attribute == null) {
        conditionallyAddStringProperty(
            duration.source(), entry, vEntry.published, duration.value());
      } else {
        conditionallyAddStringProperty(
            duration.source(), entry, vEntry.published, attribute.getNodeValue());
      }
    }
  }

  private void addUpdated(HTMLDocument fragment, BNode entry) {
    final HTMLDocument.TextField[] durations =
        fragment.getPluralTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + entryFields[4]);
    for (HTMLDocument.TextField duration : durations) {
      Node attribute = duration.source().getAttributes().getNamedItem("datetime");
      if (attribute == null) {
        conditionallyAddStringProperty(duration.source(), entry, vEntry.updated, duration.value());
      } else {
        conditionallyAddStringProperty(
            duration.source(), entry, vEntry.updated, attribute.getNodeValue());
      }
    }
  }

  private void addCategories(HTMLDocument fragment, BNode entry) {
    final HTMLDocument.TextField[] categories =
        fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + entryFields[5]);
    for (HTMLDocument.TextField category : categories) {
      conditionallyAddStringProperty(category.source(), entry, vEntry.category, category.value());
    }
  }

  private void addURLs(HTMLDocument fragment, BNode entry) throws ExtractionException {
    final HTMLDocument.TextField[] urls =
        fragment.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[6]);
    for (HTMLDocument.TextField url : urls) {
      addURIProperty(entry, vEntry.url, fragment.resolveURI(url.value()));
    }
  }

  private void addUID(HTMLDocument fragment, BNode entry) throws ExtractionException {
    final HTMLDocument.TextField uid =
        fragment.getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[7]);
    if (uid.source() == null) return;
    addURIProperty(entry, vEntry.uid, fragment.resolveURI(uid.value()));
  }

  private void addSyndications(HTMLDocument fragment, BNode entry) throws ExtractionException {
    final HTMLDocument.TextField[] syndications =
        fragment.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[8]);
    for (HTMLDocument.TextField syndication : syndications) {
      addURIProperty(entry, vEntry.syndication, fragment.resolveURI(syndication.value()));
    }
  }

  private void addInReplyTo(HTMLDocument fragment, BNode entry) throws ExtractionException {
    final HTMLDocument.TextField inReplyTo =
        fragment.getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + entryFields[9]);
    if (inReplyTo.source() == null) return;
    addURIProperty(entry, vEntry.in_reply_to, fragment.resolveURI(inReplyTo.value()));
  }

  private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
    List<Node> nodes =
        doc.findAllByClassName(
            Microformats2Prefixes.PROPERTY_PREFIX
                + entryFields[11]
                + Microformats2Prefixes.SPACE_SEPARATOR
                + Microformats2Prefixes.CLASS_PREFIX
                + "geo");
    if (nodes.isEmpty()) return;
    for (Node node : nodes) {
      BNode location = valueFactory.createBNode();
      addURIProperty(location, RDF.TYPE, vEntry.location);
      HTMLDocument fragment = new HTMLDocument(node);
      for (String field : geoFields) {
        HTMLDocument.TextField[] values =
            fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
        for (HTMLDocument.TextField val : values) {
          Node attribute = val.source().getAttributes().getNamedItem("title");
          if (attribute == null) {
            conditionallyAddStringProperty(
                val.source(), location, vVCARD.getProperty(field), val.value());
          } else {
            conditionallyAddStringProperty(
                val.source(), location, vVCARD.getProperty(field), attribute.getNodeValue());
          }
        }
      }
    }
  }
}