Java XHTMLContentHandlerの例、org.apache.tika.sax.XHTMLContentHandler Javaの例

コード例 #1

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Writes a single AcroForm field as an XHTML {@code li} element.
   *
   * <p>The element text is the field's partial name followed by {@code ": "} (when the name is
   * present) and then the field value (when it is present and not the literal string
   * {@code "null"}). The alternate field name, when present, is carried as the {@code altName}
   * attribute. Signature fields are delegated to {@code handleSignature} and produce no output
   * here. Nothing is written when there is neither an attribute nor any text.
   *
   * @param field the form field to render
   * @throws SAXException if writing to the XHTML handler fails
   */
  private void addFieldString(PDField field) throws SAXException {
    // The fully qualified name is intentionally not used for now; only the partial name
    // (shown in the content) and the alternate name (shown as an attribute) are read.
    final String partialName = field.getPartialName();
    final String alternateName = field.getAlternateFieldName();

    final AttributesImpl itemAttributes = new AttributesImpl();
    if (alternateName != null) {
      itemAttributes.addAttribute("", "altName", "altName", "CDATA", alternateName);
    }

    // Signature fields have their own rendering; hand over and stop.
    if (field instanceof PDSignatureField) {
      handleSignature(itemAttributes, (PDSignatureField) field);
      return;
    }

    final StringBuilder text = new StringBuilder();
    if (partialName != null) {
      text.append(partialName).append(": ");
    }
    final String fieldValue = field.getValueAsString();
    if (fieldValue != null && !"null".equals(fieldValue)) {
      text.append(fieldValue);
    }

    // Skip fields that would yield a completely empty list item.
    if (itemAttributes.getLength() == 0 && text.length() == 0) {
      return;
    }
    xhtml.startElement("li", itemAttributes);
    xhtml.characters(text.toString());
    xhtml.endElement("li");
  }

コード例 #2

0

ファイルを表示

ファイル: DBFParser.java プロジェクト: Zarana-Parekh/tika

 /**
  * Writes one DBF row as an XHTML {@code tr} element containing one {@code td} per cell.
  *
  * @param row the row whose cells are written
  * @param charset the charset passed to each cell to obtain its string form
  * @param xhtml the handler receiving the table markup
  * @throws SAXException if writing to the handler fails
  */
 private void writeRow(DBFRow row, Charset charset, XHTMLContentHandler xhtml)
     throws SAXException {
   xhtml.startElement("tr");
   for (DBFCell dbfCell : row.cells) {
     final String cellText = dbfCell.getString(charset);
     xhtml.startElement("td");
     xhtml.characters(cellText);
     xhtml.endElement("td");
   }
   xhtml.endElement("tr");
 }

コード例 #3

0

ファイルを表示

ファイル: DBFParser.java プロジェクト: Zarana-Parekh/tika

  /**
   * Parses a DBF stream into an XHTML table.
   *
   * <p>The content type is taken from the DBF header version and the modified date is set when
   * the header provides one. Up to {@code ROWS_TO_BUFFER_FOR_CHARSET_DETECTION} rows are buffered
   * (as deep copies) and passed to {@code getCharset} before any output is written; the chosen
   * charset is recorded as the content encoding. Output is a {@code table} whose {@code thead}
   * holds one {@code th} per column and whose {@code tbody} holds the buffered rows followed by
   * the remaining rows from the reader.
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    DBFReader reader = DBFReader.open(stream);
    DBFFileHeader header = reader.getHeader();
    metadata.set(Metadata.CONTENT_TYPE, header.getVersion().getFullMimeString());

    // Header-level metadata.
    Calendar modified = header.getLastModified();
    if (modified != null) {
      metadata.set(TikaCoreProperties.MODIFIED, modified);
    }

    // Buffer the leading rows so the charset can be detected before anything is emitted.
    // Rows are deep-copied before the reader is advanced again.
    List<DBFRow> buffered = new LinkedList<>();
    DBFRow row = reader.next();
    for (int count = 0; row != null && count < ROWS_TO_BUFFER_FOR_CHARSET_DETECTION; count++) {
      buffered.add(row.deepCopy());
      row = reader.next();
    }

    Charset charset = getCharset(buffered, header);
    metadata.set(Metadata.CONTENT_ENCODING, charset.toString());

    XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();
    xhtml.startElement("table");

    xhtml.startElement("thead");
    for (DBFColumnHeader column : header.getCols()) {
      xhtml.startElement("th");
      xhtml.characters(column.getName(charset));
      xhtml.endElement("th");
    }
    xhtml.endElement("thead");

    xhtml.startElement("tbody");

    // First drain the buffered rows, releasing each one as it is written ...
    while (!buffered.isEmpty()) {
      writeRow(buffered.remove(0), charset, xhtml);
    }

    // ... then stream whatever the reader still has; 'row' already holds the next unread row.
    for (; row != null; row = reader.next()) {
      writeRow(row, charset, xhtml);
    }

    xhtml.endElement("tbody");
    xhtml.endElement("table");
    xhtml.endDocument();
  }

コード例 #4

0

ファイルを表示

ファイル: SiteMapTikaParser.java プロジェクト: thaingo/ptd

  /**
   * Emits an XHTML document containing a single empty {@code p} element; the input stream is not
   * read. If the metadata carries no content type, it is set to {@code application/xml} first.
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    final String declaredType = metadata.get(Metadata.CONTENT_TYPE);
    if (declaredType == null) {
      metadata.set(Metadata.CONTENT_TYPE, "application/xml");
    }

    final XHTMLContentHandler out = new XHTMLContentHandler(handler, metadata);
    out.startDocument();
    // Intentionally empty paragraph: no content is extracted here.
    out.startElement("p");
    out.endElement("p");
    out.endDocument();
  }

コード例 #5

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

 /**
  * Writes the children of an outline node as a {@code ul} list, one {@code li} per child holding
  * its title, and recurses into each child directly after its {@code li}. Nothing is written when
  * the node has no children.
  *
  * @param bookmark the outline node whose children are rendered
  * @throws SAXException if writing to the XHTML handler fails
  */
 void extractBookmarkText(PDOutlineNode bookmark) throws SAXException {
   final PDOutlineItem firstChild = bookmark.getFirstChild();
   if (firstChild == null) {
     return;
   }
   xhtml.startElement("ul");
   for (PDOutlineItem item = firstChild; item != null; item = item.getNextSibling()) {
     xhtml.startElement("li");
     xhtml.characters(item.getTitle());
     xhtml.endElement("li");
     // Descend into this item's own children before moving to its sibling.
     extractBookmarkText(item);
   }
   xhtml.endElement("ul");
 }

コード例 #6

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Extracts form content from the document.
   *
   * <p>If the form has an XFA resource, XFA extraction is attempted first and, when it completes
   * without an {@code XMLStreamException} or {@code IOException}, the method returns. Otherwise
   * the traditional AcroForm fields are written as an {@code ol} inside
   * {@code <div class="acroform">}, each field being handed to {@code processAcroField} at
   * depth 0. The method returns silently when the catalog, form, field list or its iterator is
   * absent.
   *
   * <p>Derived from Ben Litchfield's org.apache.pdfbox.examples.fdf.PrintFields.
   */
  void extractAcroForm(PDDocument pdf) throws IOException, SAXException {
    final PDDocumentCatalog catalog = pdf.getDocumentCatalog();
    final PDAcroForm form = (catalog == null) ? null : catalog.getAcroForm();
    if (form == null) {
      return;
    }

    // Prefer XFA when present; on failure fall back to the classic AcroForm below.
    final PDXFAResource xfa = form.getXFA();
    if (xfa != null) {
      XFAExtractor xfaExtractor = new XFAExtractor();
      try (InputStream xfaStream =
          new BufferedInputStream(new ByteArrayInputStream(xfa.getBytes()))) {
        xfaExtractor.extract(xfaStream, xhtml, metadata, context);
        return; // XFA succeeded, nothing more to do
      } catch (XMLStreamException | IOException ignored) {
        // XFA could not be read or parsed: deliberately fall through to the AcroForm fields.
      }
    }

    @SuppressWarnings("rawtypes")
    List fields = form.getFields();
    if (fields == null) {
      return;
    }

    @SuppressWarnings("rawtypes")
    ListIterator fieldIterator = fields.listIterator();
    if (fieldIterator == null) {
      return;
    }

    xhtml.startElement("div", "class", "acroform");
    xhtml.startElement("ol");
    while (fieldIterator.hasNext()) {
      Object candidate = fieldIterator.next();
      // instanceof is false for null, so no separate null test is needed.
      if (candidate instanceof PDField) {
        processAcroField((PDField) candidate, 0);
      }
    }
    xhtml.endElement("ol");
    xhtml.endElement("div");
  }

コード例 #7

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Writes a form field and, for non-terminal fields, its children as a nested {@code ol}.
   *
   * <p>Recursion stops (writing nothing) once {@code currentRecursiveDepth} reaches
   * {@code MAX_ACROFORM_RECURSIONS}.
   *
   * @param field the field to write
   * @param currentRecursiveDepth nesting depth of {@code field}; children are processed at this
   *     value plus one
   */
  private void processAcroField(PDField field, final int currentRecursiveDepth)
      throws SAXException, IOException {

    if (currentRecursiveDepth >= MAX_ACROFORM_RECURSIONS) {
      return;
    }

    addFieldString(field);

    if (!(field instanceof PDNonTerminalField)) {
      return;
    }
    final PDNonTerminalField parent = (PDNonTerminalField) field;
    final int childDepth = currentRecursiveDepth + 1;

    xhtml.startElement("ol");
    for (PDField kid : parent.getChildren()) {
      processAcroField(kid, childDepth);
    }
    xhtml.endElement("ol");
  }

コード例 #8

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Finishes the document: writes bookmark text, extracts embedded documents, optionally extracts
   * AcroForm content (when enabled in the config), and closes the XHTML document.
   *
   * <p>IOExceptions from embedded-document and AcroForm extraction are passed to
   * {@code handleCatchableIOE} instead of aborting the remaining steps. Any
   * {@code TikaException} or {@code SAXException} is rethrown wrapped in an
   * {@code IOExceptionWithCause}.
   */
  @Override
  protected void endDocument(PDDocument pdf) throws IOException {
    try {
      // Bookmarks first.
      extractBookmarkText();

      try {
        extractEmbeddedDocuments(pdf);
      } catch (IOException embeddedFailure) {
        handleCatchableIOE(embeddedFailure);
      }

      // AcroForm data goes at the very end of the document, and only when configured.
      if (config.getExtractAcroFormContent()) {
        try {
          extractAcroForm(pdf);
        } catch (IOException acroFormFailure) {
          handleCatchableIOE(acroFormFailure);
        }
      }

      xhtml.endDocument();
    } catch (TikaException | SAXException e) {
      throw new IOExceptionWithCause("Unable to end a document", e);
    }
  }

コード例 #9

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

 /**
  * Opens the XHTML document, rethrowing any {@code SAXException} wrapped in an
  * {@code IOExceptionWithCause}.
  */
 @Override
 protected void startDocument(PDDocument pdf) throws IOException {
   try {
     xhtml.startDocument();
   } catch (SAXException saxFailure) {
     throw new IOExceptionWithCause("Unable to start a document", saxFailure);
   }
 }

コード例 #10

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

 /**
  * Opens the {@code <div class="page">} wrapper for a page and then starts the first paragraph.
  * A {@code SAXException} from opening the div is rethrown wrapped in an
  * {@code IOExceptionWithCause}.
  */
 @Override
 protected void startPage(PDPage page) throws IOException {
   try {
     xhtml.startElement("div", "class", "page");
   } catch (SAXException saxFailure) {
     throw new IOExceptionWithCause("Unable to start a page", saxFailure);
   }
   writeParagraphStart();
 }

コード例 #11

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Hands a PDF-embedded file to the embedded document extractor and then writes an empty
   * {@code <div class="embedded" id="...">} marker for it.
   *
   * <p>A {@code null} file is skipped silently. When {@code fileName} is {@code null} the
   * {@code displayName} is used in its place. The file is parsed only if the extractor's
   * {@code shouldParseEmbedded} accepts the metadata built here. The input stream is closed
   * quietly in all cases.
   *
   * @param displayName fallback name used when {@code fileName} is {@code null}
   * @param unicodeFileName not used by this method
   * @param fileName preferred resource name; also used as the marker div's {@code id}
   * @param file the embedded file, may be {@code null}
   * @param extractor decides whether to parse the file and performs the parse
   */
  private void extractPDEmbeddedFile(
      String displayName,
      String unicodeFileName,
      String fileName,
      PDEmbeddedFile file,
      EmbeddedDocumentExtractor extractor)
      throws SAXException, IOException, TikaException {

    if (file == null) {
      return; // nothing to extract; skip silently
    }

    if (fileName == null) {
      fileName = displayName;
    }

    // TODO: other metadata?
    final Metadata embeddedMetadata = new Metadata();
    embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
    embeddedMetadata.set(Metadata.CONTENT_TYPE, file.getSubtype());
    embeddedMetadata.set(Metadata.CONTENT_LENGTH, Long.toString(file.getSize()));
    embeddedMetadata.set(
        TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
        TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.toString());
    embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, fileName);

    if (!extractor.shouldParseEmbedded(embeddedMetadata)) {
      return;
    }

    TikaInputStream embeddedStream = null;
    try {
      embeddedStream = TikaInputStream.get(file.createInputStream());
      extractor.parseEmbedded(
          embeddedStream, new EmbeddedContentHandler(xhtml), embeddedMetadata, false);

      // Marker element emitted after the embedded content has been parsed.
      final AttributesImpl markerAttributes = new AttributesImpl();
      markerAttributes.addAttribute("", "class", "class", "CDATA", "embedded");
      markerAttributes.addAttribute("", "id", "id", "CDATA", fileName);
      xhtml.startElement("div", markerAttributes);
      xhtml.endElement("div");
    } finally {
      IOUtils.closeQuietly(embeddedStream);
    }
  }

コード例 #12

0

ファイルを表示

ファイル: AbstractXML2003Parser.java プロジェクト: xexes/tika

  /**
   * Parses the XML stream with the SAX parser obtained from the parse context and writes the
   * result as an XHTML document to {@code handler}.
   *
   * <p>The XHTML document is always ended in the {@code finally} block, including when parsing
   * fails.
   *
   * @throws TikaException with message "XML parse error" when the SAX parse fails with a
   *     {@code SAXException} that {@code tagged.throwIfCauseOf} did not rethrow
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    setContentType(metadata);

    final XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
    xhtml.startDocument();

    // Wrap the XHTML handler so that failures can be checked against it in the catch below.
    TaggedContentHandler tagged = new TaggedContentHandler(xhtml);
    try {
      // The stream is wrapped in a CloseShieldInputStream — presumably so the SAX parser cannot
      // close the caller's stream; confirm against that class's documentation.
      // Handler chain (outermost first): OfflineContentHandler -> EmbeddedContentHandler ->
      // the handler supplied by getContentHandler(tagged, metadata, context).
      context
          .getSAXParser()
          .parse(
              new CloseShieldInputStream(stream),
              new OfflineContentHandler(
                  new EmbeddedContentHandler(getContentHandler(tagged, metadata, context))));
    } catch (SAXException e) {
      // NOTE(review): throwIfCauseOf presumably rethrows e when it originated from the tagged
      // (downstream) handler; anything else is reported as an XML parse error.
      tagged.throwIfCauseOf(e);
      throw new TikaException("XML parse error", e);
    } finally {
      xhtml.endDocument();
    }
  }

コード例 #13

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Writes the data of a PDF signature field as a nested list.
   *
   * <p>The signature's name, contact info, location, reason and (when present) formatted sign
   * date are collected. If at least one of them is non-null and non-empty, an {@code li} carrying
   * {@code parentAttributes} is written, containing an {@code <ol type="signaturedata">} with one
   * {@code <li signdata="key">value</li>} per non-empty entry, in key order. If the field has no
   * signature, or every collected value is null or empty, nothing is written.
   *
   * @param parentAttributes attributes for the enclosing {@code li}
   * @param sigField the signature field to render
   * @throws SAXException if writing to the XHTML handler fails
   */
  private void handleSignature(AttributesImpl parentAttributes, PDSignatureField sigField)
      throws SAXException {

    PDSignature sig = sigField.getSignature();
    if (sig == null) {
      return;
    }

    // TreeMap keeps the output order stable (alphabetical by key).
    Map<String, String> vals = new TreeMap<>();
    vals.put("name", sig.getName());
    vals.put("contactInfo", sig.getContactInfo());
    vals.put("location", sig.getLocation());
    vals.put("reason", sig.getReason());

    Calendar cal = sig.getSignDate();
    if (cal != null) {
      // NOTE(review): this mutates the shared dateFormat's time zone; if dateFormat is a
      // SimpleDateFormat shared across threads this is not thread-safe — confirm.
      dateFormat.setTimeZone(cal.getTimeZone());
      vals.put("date", dateFormat.format(cal.getTime()));
    }

    // See if there is any data. This must inspect the VALUES: the keys are fixed, non-empty
    // literals, so checking them (as the previous code did) always reported data and produced
    // an empty <li><ol/></li> for signatures without any content.
    boolean hasData = false;
    for (String val : vals.values()) {
      if (val != null && !val.equals("")) {
        hasData = true;
        break;
      }
    }
    if (!hasData) {
      return;
    }

    xhtml.startElement("li", parentAttributes);

    AttributesImpl attrs = new AttributesImpl();
    attrs.addAttribute("", "type", "type", "CDATA", "signaturedata");

    xhtml.startElement("ol", attrs);
    for (Map.Entry<String, String> e : vals.entrySet()) {
      if (e.getValue() == null || e.getValue().equals("")) {
        continue;
      }
      attrs = new AttributesImpl();
      attrs.addAttribute("", "signdata", "signdata", "CDATA", e.getKey());
      xhtml.startElement("li", attrs);
      xhtml.characters(e.getValue());
      xhtml.endElement("li");
    }
    xhtml.endElement("ol");
    xhtml.endElement("li");
  }

コード例 #14

0

ファイルを表示

ファイル: AbstractDBParser.java プロジェクト: asitang/tika_pdf_celgene

  /**
   * Parses a database stream into XHTML, writing one {@code table} per database table.
   *
   * <p>A connection is obtained via {@code getConnection}; all table names are added to the
   * metadata under {@code Database.TABLE_NAME}; then each table is written as
   * {@code <table name="...">} with a {@code thead} row of column headers and a {@code tbody}
   * whose rows are emitted by the table reader's {@code nextRow}.
   *
   * <p>Once the connection has been obtained, {@code close()} is always invoked, whether table
   * discovery, document start, table writing or document end fails. Previously a failure outside
   * the table-writing loop left the connection open. A {@code SQLException} from {@code close()}
   * is ignored (best-effort cleanup).
   *
   * @throws IOException if the table names cannot be read (wrapping the {@code SQLException})
   * @throws SAXException if writing to the handler fails
   * @throws TikaException declared by the parser contract
   */
  @Override
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    connection = getConnection(stream, metadata, context);
    try {
      List<String> tableNames;
      try {
        tableNames = getTableNames(connection, metadata, context);
      } catch (SQLException e) {
        throw new IOExceptionWithCause(e);
      }
      for (String tableName : tableNames) {
        // add table names to parent metadata
        metadata.add(Database.TABLE_NAME, tableName);
      }

      XHTMLContentHandler xHandler = new XHTMLContentHandler(handler, metadata);
      xHandler.startDocument();
      try {
        for (String tableName : tableNames) {
          JDBCTableReader tableReader = getTableReader(connection, tableName, context);
          xHandler.startElement("table", "name", tableReader.getTableName());
          xHandler.startElement("thead");
          xHandler.startElement("tr");
          for (String header : tableReader.getHeaders()) {
            xHandler.startElement("th");
            xHandler.characters(header);
            xHandler.endElement("th");
          }
          xHandler.endElement("tr");
          xHandler.endElement("thead");
          xHandler.startElement("tbody");
          while (tableReader.nextRow(xHandler, context)) {
            // no-op: nextRow writes the row itself and reports whether one was written
          }
          xHandler.endElement("tbody");
          xHandler.endElement("table");
        }
      } finally {
        // The document was started successfully, so always end it.
        xHandler.endDocument();
      }
    } finally {
      // Release the connection no matter where the failure (if any) happened.
      try {
        close();
      } catch (SQLException ignored) {
        // Best-effort cleanup: a failure to close must not mask the primary outcome.
      }
    }
  }

コード例 #15

0

ファイルを表示

ファイル: AbstractPDF2XHTML.java プロジェクト: Zarana-Parekh/tika

  /**
   * Finishes a page: processes its annotations, optionally runs OCR, and closes the page
   * {@code div}.
   *
   * <p>For every annotation:
   * <ul>
   *   <li>file attachments are passed to {@code extractMultiOSPDEmbeddedFiles};
   *   <li>when annotation text extraction is enabled, URI links are written as
   *       {@code <div class="annotation"><a href="...">...</a></div>} with the link repeated as
   *       the visible text, and markup annotations are written as a {@code div.annotation} with
   *       optional {@code annotationTitle}, {@code annotationSubject} and
   *       {@code annotationContents} child divs.
   * </ul>
   *
   * <p>{@code pageIndex} is incremented in all cases. An {@code IOException} raised inside the
   * body is recorded in {@code exceptions} rather than propagated; a {@code SAXException} or
   * {@code TikaException} reaching the outer handler is rethrown as an
   * {@code IOExceptionWithCause}.
   */
  @Override
  protected void endPage(PDPage page) throws IOException {

    try {
      final EmbeddedDocumentExtractor embeddedExtractor = getEmbeddedDocumentExtractor();
      for (PDAnnotation annotation : page.getAnnotations()) {

        if (annotation instanceof PDAnnotationFileAttachment) {
          PDAnnotationFileAttachment attachment = (PDAnnotationFileAttachment) annotation;
          PDComplexFileSpecification spec = (PDComplexFileSpecification) attachment.getFile();
          try {
            extractMultiOSPDEmbeddedFiles(attachment.getAttachmentName(), spec, embeddedExtractor);
          } catch (SAXException e) {
            // NOTE(review): if IOExceptionWithCause is an IOException subtype, this exception
            // (and the one below) is caught by the outer IOException handler and recorded in
            // 'exceptions' instead of propagating — confirm that is intended.
            throw new IOExceptionWithCause("file embedded in annotation sax exception", e);
          } catch (TikaException e) {
            throw new IOExceptionWithCause("file embedded in annotation tika exception", e);
          } catch (IOException e) {
            handleCatchableIOE(e);
          }
        }

        // TODO: remove once PDFBOX-1143 is fixed:
        if (!config.getExtractAnnotationText()) {
          continue;
        }

        if (annotation instanceof PDAnnotationLink) {
          // instanceof is false for a null action, so this also covers "no action".
          PDAction action = ((PDAnnotationLink) annotation).getAction();
          if (action instanceof PDActionURI) {
            // The link cannot currently be associated with its text, so the link itself is
            // repeated as if it were the visible text.
            String link = ((PDActionURI) action).getURI();
            if (link != null && !link.trim().isEmpty()) {
              xhtml.startElement("div", "class", "annotation");
              xhtml.startElement("a", "href", link);
              xhtml.characters(link);
              xhtml.endElement("a");
              xhtml.endElement("div");
            }
          }
        }

        if (annotation instanceof PDAnnotationMarkup) {
          PDAnnotationMarkup markup = (PDAnnotationMarkup) annotation;
          // TODO: maybe also markup.getRichContents()?
          // Each entry pairs the CSS class of the child div with its (possibly null) text.
          String[][] parts = {
            {"annotationTitle", markup.getTitlePopup()},
            {"annotationSubject", markup.getSubject()},
            {"annotationContents", markup.getContents()}
          };
          boolean allAbsent = parts[0][1] == null && parts[1][1] == null && parts[2][1] == null;
          if (!allAbsent) {
            xhtml.startElement("div", "class", "annotation");
            for (String[] part : parts) {
              if (part[1] != null) {
                xhtml.startElement("div", "class", part[0]);
                xhtml.characters(part[1]);
                xhtml.endElement("div");
              }
            }
            xhtml.endElement("div");
          }
        }
      }

      if (config.getOCRStrategy().equals(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION)) {
        doOCROnCurrentPage();
      }
      // Closes the page-level div.
      xhtml.endElement("div");
    } catch (SAXException | TikaException e) {
      throw new IOExceptionWithCause("Unable to end a page", e);
    } catch (IOException e) {
      exceptions.add(e);
    } finally {
      pageIndex++;
    }
  }