Example #1
0
  /**
   * Reads the given stream as a zip archive and hands selected entries to the delegate parsers.
   *
   * <p>The {@code mimetype} entry becomes the content type, {@code meta.xml} is passed to the
   * {@code meta} parser and every entry whose name ends in {@code content.xml} is passed to the
   * {@code content} parser. All other entries are skipped.
   *
   * @param stream the zipped document
   * @param baseHandler receiver of the SAX events produced by the content parser
   * @param metadata metadata object that is filled while parsing
   * @param context parse context forwarded to the delegate parsers
   */
  public void parse(
      InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {

    // The relative order of metadata and content inside the archive is unknown, so the
    // end-of-document event is intercepted until every entry has been visited.
    EndDocumentShieldingContentHandler shieldedHandler =
        new EndDocumentShieldingContentHandler(baseHandler);

    // Visit the archive entries one after another
    ZipInputStream archive = new ZipInputStream(stream);
    for (ZipEntry current = archive.getNextEntry();
        current != null;
        current = archive.getNextEntry()) {
      String entryName = current.getName();
      if (entryName.equals("mimetype")) {
        metadata.set(Metadata.CONTENT_TYPE, IOUtils.toString(archive, "UTF-8"));
      } else if (entryName.equals("meta.xml")) {
        meta.parse(archive, new DefaultHandler(), metadata, context);
      } else if (entryName.endsWith("content.xml")) {
        content.parse(archive, shieldedHandler, metadata, context);
      }
    }

    // Now forward the end-of-document event, if one was intercepted
    if (shieldedHandler.getEndDocumentWasCalled()) {
      shieldedHandler.reallyEndDocument();
    }
  }
  /**
   * Parses the given in-memory content with an auto-detecting Tika parser and adds the
   * handler's text to the current document as the tokenized contents field.
   *
   * <p>Exceptions are logged together with the path and revision instead of being propagated,
   * and the stream is closed in every case.
   *
   * @param in content to parse
   * @param path resource name, passed to Tika as the resource name and used in log messages
   * @param dirEntry directory entry whose revision is used in log messages
   */
  private void scan(ByteArrayInputStream in, String path, SVNDirEntry dirEntry) {
    try {
      Metadata tikaMetadata = new Metadata();
      tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, path);

      // This follows a proposal by the Tika authors, see
      // https://issues.apache.org/jira/browse/TIKA-232:
      // use the default configuration, i.e. without a delegate parser.
      TikaConfig defaultConfig = TikaConfig.getDefaultConfig();
      Parser autoDetectParser = new AutoDetectParser(defaultConfig);
      DefaultHandler bodyHandler = new BodyContentHandler();
      autoDetectParser.parse(in, bodyHandler, tikaMetadata);
      getDocument().addTokenizedField(FieldNames.CONTENTS, bodyHandler.toString());
    } catch (Exception parseFailure) {
      LOGGER.error(
          "We had an exception " + path + " (r" + dirEntry.getRevision() + ")", parseFailure);
    } finally {
      try {
        in.close();
      } catch (Exception closeFailure) {
        LOGGER.error(
            "We had an exception " + path + " (r" + dirEntry.getRevision() + ")", closeFailure);
      }
    }
  }
Example #3
0
  /**
   * Parses the sample RAR archive with the auto-detecting parser and checks the detected
   * content type as well as the names and text of the embedded documents.
   */
  @Test
  public void testRarParsing() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    Parser parser = new AutoDetectParser(); // Should auto-detect!

    try (InputStream archive =
        RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
      parser.parse(archive, handler, metadata, recursingContext);
    }

    assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));

    // Each embedded file name is followed by a snippet of that file's text
    String[] expectedFragments = {
      "test-documents/testEXCEL.xls",
      "Sample Excel Worksheet",
      "test-documents/testHTML.html",
      "Test Indexation Html",
      "test-documents/testOpenOffice2.odt",
      "This is a sample Open Office document",
      "test-documents/testPDF.pdf",
      "Apache Tika",
      "test-documents/testPPT.ppt",
      "Sample Powerpoint Slide",
      "test-documents/testRTF.rtf",
      "indexation Word",
      "test-documents/testTXT.txt",
      "Test d'indexation de Txt",
      "test-documents/testWORD.doc",
      "This is a sample Microsoft Word Document",
      "test-documents/testXML.xml",
      "Rida Benjelloun"
    };
    String extracted = handler.toString();
    for (String fragment : expectedFragments) {
      assertContains(fragment, extracted);
    }
  }
  /**
   * Prepares the metadata for one sub data entity described by the given information map, opens
   * its stream through the URL stream provider and parses it with the Leech parser.
   *
   * @param subDataEntityInformation map read for the keys {@code urlNameWithPassword}, {@code
   *     Message-ID} and {@code folder}
   * @param metadata metadata object that is pre-filled before the stream provider is asked
   * @param handler2use4recursiveCall content handler handed to the parser
   * @param context parse context handed to the stream provider and the parser
   */
  @Override
  protected void processSubDataEntity(
      MultiValueHashMap<String, Object> subDataEntityInformation,
      Metadata metadata,
      ContentHandler handler2use4recursiveCall,
      ParseContext context)
      throws Exception {

    URLName urlNameWithPassword =
        (URLName) subDataEntityInformation.getFirst("urlNameWithPassword");
    String messageId = (String) subDataEntityInformation.getFirst("Message-ID");
    String messageFolder = (String) subDataEntityInformation.getFirst("folder");
    String entityId = ImapURLStreamProvider.getEntityId(messageFolder, messageId);

    // We set these values here already - the data was loaded efficiently in a prefetching
    // step. If they are already present in the Metadata object, addFirstMetadata does not
    // load them again.
    metadata.set(Metadata.SOURCE, urlNameWithPassword.toString());
    metadata.set(IncrementalCrawlingHistory.dataEntityId, entityId);
    metadata.set(
        IncrementalCrawlingHistory.dataEntityContentFingerprint,
        ImapURLStreamProvider.getDataEntityContentFingerprint(entityId));

    // The resource name is the same URL with an empty password
    URLName urlNameWithoutPassword =
        new URLName(
            urlNameWithPassword.getProtocol(),
            urlNameWithPassword.getHost(),
            urlNameWithPassword.getPort(),
            urlNameWithPassword.getFile(),
            urlNameWithPassword.getUsername(),
            "");
    metadata.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString());

    // Without a message id the entity is treated as a folder, otherwise as a mail message
    String contentType =
        (messageId == null) ? DatasourceMediaTypes.IMAPFOLDER.toString() : "message/rfc822";
    metadata.set("Content-Type", contentType);

    Metadata enrichedMetadata =
        URLStreamProvider.getURLStreamProvider4Protocol(urlNameWithPassword.getProtocol())
            .addFirstMetadata(urlNameWithPassword, metadata, context);
    InputStream entityStream =
        URLStreamProvider.getURLStreamProvider(urlNameWithPassword)
            .getStream(urlNameWithPassword, enrichedMetadata, context);

    try {
      if (m_leech == null) {
        m_leech = new Leech();
      }

      // For a message this will hopefully pick the Tika RFC822Parser
      Parser leechParser = m_leech.getParser();
      leechParser.parse(entityStream, handler2use4recursiveCall, enrichedMetadata, context);
    } finally {
      if (entityStream != null) {
        entityStream.close();
      }
    }
  }
  /**
   * Parses the sample JPEG and checks that the extracted image width is {@code "100"}; all
   * extracted metadata entries are written to the trace log.
   */
  public void testJPEG() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    // try-with-resources closes the resource stream even when parsing fails
    try (InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG.jpg")) {
      parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    for (String name : metadata.names()) {
      logger.trace("JPEG-- " + name + "=" + metadata.get(name));
    }
  }
  /**
   * Parses the sample PNG carrying IPTC data, writes all extracted metadata entries to the
   * trace log and checks the image width and the IPTC headline.
   */
  public void testPNGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/png");
    // try-with-resources closes the resource stream even when parsing fails
    try (InputStream stream =
        getClass().getResourceAsStream("/test-documents/testPNG_IPTC.png")) {
      parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    for (String name : metadata.names()) {
      logger.trace("PNG-- " + name + "=" + metadata.get(name));
    }
    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
  }
  /**
   * Parses the sample JPEG with an {@code ExiftoolImageParser} configured with two custom XMP
   * pass-through properties and checks that both values end up in the metadata.
   */
  public void testJPEGCustomXmp() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");

    ArrayList<Property> passthroughXmpProperties = new ArrayList<Property>(2);
    passthroughXmpProperties.add(Property.internalText("XMP-custom:Text"));
    passthroughXmpProperties.add(Property.internalText("XMP-custom:TextML"));
    Parser passthroughParser = new ExiftoolImageParser(null, passthroughXmpProperties);

    // try-with-resources closes the resource stream even when parsing fails
    try (InputStream stream =
        getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg")) {
      passthroughParser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    assertEquals("customTextField", metadata.get("XMP-custom:Text"));
    assertEquals("customMultilineField", metadata.get("XMP-custom:TextML"));
  }
  /**
   * Parses the attachment content with Tika and copies author, creation date and keywords, where
   * present, into {@code eventResult} under the keys configured in {@code currentProperties}.
   *
   * <p>Parse and I/O failures are printed and otherwise ignored; in that case only the metadata
   * gathered up to the failure is evaluated.
   *
   * @param response the item response; not read by this method
   * @param stream data handler that supplies the content to parse
   */
  private void manageDetails(final GetItemResponse response, final DataHandler stream) {

    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();

    // try-with-resources closes the stream on success and on failure. The stream is not
    // reset afterwards: it is never read again, and resetting a closed stream only fails.
    try (InputStream is = stream.getInputStream()) {
      parser.parse(is, contenthandler, metadata, context);
    } catch (IOException | SAXException | TikaException e) {
      e.printStackTrace();
    }

    String contentAuthorValue = metadata.get(Metadata.AUTHOR);
    String contentAuthorKey = currentProperties.getProperty(KpeopleLabel.getCorePropertiesAuthor());
    if (contentAuthorValue != null) {
      eventResult.setDetail(contentAuthorKey, contentAuthorValue);
    }

    String contentCreationDateValue = metadata.get(Metadata.CREATION_DATE);
    String contentCreationDateKey =
        currentProperties.getProperty(KpeopleLabel.getCorePropertiesCreationDate());
    if (contentCreationDateValue != null) {
      eventResult.setDetail(contentCreationDateKey, contentCreationDateValue);
    }

    String contentKeywordsValue = metadata.get(Metadata.KEYWORDS);
    String contentKeywordsKey =
        currentProperties.getProperty(KpeopleLabel.getCorePropertiesKeywords());
    if (contentKeywordsValue != null) {
      eventResult.setDetail(contentKeywordsKey, contentKeywordsValue);
    }
  }
Example #9
0
  /**
   * Verifies that the parser registered in the ParseContext is invoked for every embedded entry
   * of the sample RAR archive.
   */
  @Test
  public void testEmbedded() throws Exception {
    Metadata metadata = new Metadata();
    ContentHandler handler = new BodyContentHandler();
    Parser parser = new AutoDetectParser(); // Should auto-detect!

    try (InputStream archive =
        RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
      parser.parse(archive, handler, metadata, trackingContext);
    }

    // All 9 documents must have been seen, but not the directory
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Names are expected, content types are not, because rar
    //  does not store the content types
    String[] expectedNames = {
      "test-documents/testEXCEL.xls",
      "test-documents/testHTML.html",
      "test-documents/testOpenOffice2.odt",
      "test-documents/testPDF.pdf",
      "test-documents/testPPT.ppt",
      "test-documents/testRTF.rtf",
      "test-documents/testTXT.txt",
      "test-documents/testWORD.doc",
      "test-documents/testXML.xml"
    };
    for (int index = 0; index < expectedNames.length; index++) {
      assertEquals(expectedNames[index], tracker.filenames.get(index));
    }

    for (String mediaType : tracker.mediatypes) {
      assertNull(mediaType);
    }
    for (String createdAt : tracker.createdAts) {
      assertNull(createdAt);
    }
    for (String modifiedAt : tracker.modifiedAts) {
      assertNotNull(modifiedAt);
      assertTrue("Modified at " + modifiedAt, modifiedAt.startsWith("20"));
    }

    // The file names must also show up in the extracted text
    String[] namesInContent = {
      "test-documents/testHTML.html",
      "test-documents/testEXCEL.xls",
      "test-documents/testOpenOffice2.odt",
      "test-documents/testPDF.pdf",
      "test-documents/testPPT.ppt",
      "test-documents/testRTF.rtf",
      "test-documents/testTXT.txt",
      "test-documents/testWORD.doc",
      "test-documents/testXML.xml"
    };
    String extracted = handler.toString();
    for (String name : namesInContent) {
      assertContains(name, extracted);
    }
  }
  /**
   * Parses the sample TIFF carrying IPTC data, writes all extracted metadata entries to the
   * trace log and checks the IPTC keywords and headline.
   */
  public void testTIFFIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
    // try-with-resources closes the resource stream even when parsing fails
    try (InputStream stream =
        getClass().getResourceAsStream("/test-documents/testTIFF_IPTC.tif")) {
      parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    for (String name : metadata.names()) {
      logger.trace("TIFF-- " + name + "=" + metadata.get(name));
    }
    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("garden"));
    assertTrue(iptcKeywords.contains("cat"));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
  }
Example #11
0
    /**
     * Parses the record's stream with the parser detected for the record, builds a Solr input
     * document from the SAX events and passes the resulting record to the child command.
     *
     * @param record record whose fields are copied into the Tika metadata
     * @param inputStream content to parse; closed quietly before this method returns
     * @return {@code false} if no parser was detected, otherwise the child's result
     */
    @Override
    protected boolean doProcess(Record record, InputStream inputStream) {
      Parser parser = detectParser(record);
      if (parser == null) {
        return false;
      }

      ParseContext parseContext = new ParseContext();
      parseContext.set(Locale.class, locale);

      // Every record field is made available to the parser as metadata
      Metadata metadata = new Metadata();
      for (Entry<String, Object> field : record.getFields().entries()) {
        metadata.add(field.getKey(), field.getValue().toString());
      }

      SolrContentHandler handler =
          solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
      try {
        inputStream = TikaInputStream.get(inputStream);

        // With an XPath expression configured, only matching events reach the Solr handler.
        // Example expression: "/xhtml:html/xhtml:body/xhtml:div/descendant:node()"
        ContentHandler parsingHandler;
        if (xpathExpr == null) {
          parsingHandler = handler;
        } else {
          Matcher xpathMatcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, xpathMatcher);
        }

        try {
          parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException | SAXException | TikaException e) {
          throw new MorphlineRuntimeException("Cannot parse", e);
        }
      } finally {
        if (inputStream != null) {
          Closeables.closeQuietly(inputStream);
        }
      }

      SolrInputDocument solrDocument = handler.newDocument();
      LOG.debug("solr doc: {}", solrDocument);
      return getChild().process(toRecord(solrDocument));
    }
  /**
   * Asks Tika to translate the contents into HTML and stores the result through the rendering
   * context's content writer with mimetype {@code text/html}.
   *
   * @param p the Tika parser to convert with
   * @param context rendering context supplying the content reader, the source node, the
   *     parameters and the content writer
   * @throws RenditionServiceException if the Tika parse fails for any reason
   */
  private void generateHTML(Parser p, RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();

    // Setup things to parse with
    StringWriter sw = new StringWriter();
    ContentHandler handler = buildContentHandler(sw, context);

    // Tell Tika what we're dealing with
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, contentReader.getMimetype());
    metadata.set(
        Metadata.RESOURCE_NAME_KEY,
        nodeService.getProperty(context.getSourceNode(), ContentModel.PROP_NAME).toString());

    // Our parse context needs to extract images
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, new TikaImageExtractingParser(context));

    // Parse
    try {
      p.parse(contentReader.getContentInputStream(), handler, metadata, parseContext);
    } catch (Exception e) {
      throw new RenditionServiceException("Tika HTML Conversion Failed", e);
    }

    // As a string
    String html = sw.toString();

    // If we're doing body-only, remove all the html namespaces
    //  that will otherwise clutter up the document
    boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
    if (bodyOnly) {
      html = html.replaceAll("<\\?xml.*?\\?>", "");
      html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"", "<p");
      // Java replacement strings reference groups with "$1". A backslash only escapes the
      // next character, so "\\1" would emit a literal "1" and turn every heading into <h1.
      html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"", "<h$1");
      html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"", "<div");
      html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"", "<table");
      html = html.replaceAll("&#13;", "");
    }

    // Save it
    ContentWriter contentWriter = context.makeContentWriter();
    contentWriter.setMimetype("text/html");
    contentWriter.putContent(html);
  }
Example #13
0
  /**
   * Converts the given raw bytes with the auto-detecting parser and stores the serialized
   * output, with the XHTML namespace URI removed, via {@code setHtml}.
   *
   * <p>Any exception raised during conversion is logged and not propagated.
   *
   * @param data raw document bytes
   */
  public void setBinaryContent(byte[] data) {
    InputStream rawInput = new ByteArrayInputStream(data);
    ByteArrayOutputStream rendered = new ByteArrayOutputStream();

    try {
      TransformerHandler serializer =
          getTransformerHandler(rendered, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
      AUTO_DETECT_PARSER.parse(rawInput, serializer, new Metadata(), context);

      // Hack: drop every occurrence of the XHTML namespace URI that Tika writes to the output
      String serialized = new String(rendered.toByteArray(), DEFAULT_ENCODING);
      setHtml(serialized.replace("http://www.w3.org/1999/xhtml", ""));
    } catch (Exception e) {
      logger.error("Error parsing file", e);
    }
  }
Example #14
0
 /**
  * Parses the input with the configured parser, wrapped in a {@code ForkParser} when forking is
  * enabled, and sends the result to a content handler created for the output.
  *
  * @param input stream to parse
  * @param output destination handed to {@code getContentHandler}
  * @param metadata metadata object filled by the parser
  */
 public void process(InputStream input, OutputStream output, Metadata metadata)
     throws Exception {
   final Parser p;
   if (fork) {
     p = new ForkParser(TikaCLI.class.getClassLoader(), parser);
   } else {
     p = parser;
   }

   ContentHandler handler = getContentHandler(output, metadata);
   p.parse(input, handler, metadata, context);

   // fix for TIKA-596: a parser that produces no XHTML output never
   // triggers the output document, so the metadata would not be written;
   // in that case end the document explicitly
   if (!(handler instanceof NoDocumentMetHandler)) {
     return;
   }
   NoDocumentMetHandler metHandler = (NoDocumentMetHandler) handler;
   if (!metHandler.metOutput()) {
     metHandler.endDocument();
   }
 }
Example #15
0
  // TIKA-1600: Test that null pointer doesn't break parsing.
  @Test
  public void testNullStylesInODTFooter() throws Exception {
    Parser odfParser = new OpenDocumentParser();
    try (InputStream document =
        ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
      ContentHandler bodyHandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      odfParser.parse(document, bodyHandler, metadata, new ParseContext());

      assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));

      // Phrases that must appear in the extracted body text
      String[] expectedPhrases = {
        "Utilisation de ce document",
        "Copyright and License",
        "Changer la langue",
        "La page d’accueil permet de faire une recherche simple"
      };
      String extracted = bodyHandler.toString();
      for (String phrase : expectedPhrases) {
        assertContains(phrase, extracted);
      }
    }
  }
  /**
   * Runs the parser over the stream and returns the collected metadata with the two XMPDM
   * peak-audio file path entries removed.
   *
   * @param inputStream content to parse
   * @param metadata metadata object to fill; a new one is created when {@code null}
   * @param parser parser to use; also registered in the parse context
   * @return the filled metadata object
   * @throws IOException wrapping any exception raised by the parser
   */
  protected static Metadata extractMetadata(
      InputStream inputStream, Metadata metadata, Parser parser) throws IOException {

    Metadata result = (metadata != null) ? metadata : new Metadata();

    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    ContentHandler contentHandler = new WriteOutContentHandler(new DummyWriter());

    try {
      parser.parse(inputStream, contentHandler, result, parseContext);
    } catch (Exception e) {
      Throwable rootCause = ExceptionUtils.getRootCause(e);
      boolean encryptedSource =
          rootCause instanceof CryptographyException
              || rootCause instanceof EncryptedDocumentException
              || rootCause instanceof UnsupportedZipFeatureException;

      // Expected failures are reported as warnings, everything else as an error
      if (encryptedSource) {
        if (_log.isWarnEnabled()) {
          _log.warn("Unable to extract metadata from an encrypted file");
        }
      } else if (e instanceof TikaException) {
        if (_log.isWarnEnabled()) {
          _log.warn("Unable to extract metadata");
        }
      } else {
        _log.error(e, e);
      }

      throw new IOException(e);
    }

    // Remove potential security risks

    result.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
    result.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());

    return result;
  }
  /**
   * Parses the given resource recursively with inline image extraction enabled and checks the
   * number of metadata objects, the extracted text and some metadata of the first two objects.
   *
   * @param resource classpath location of the test document
   * @param nonOCRContains strings that must appear in the extracted text in every case
   * @param numMetadatas expected number of metadata objects collected by the wrapper
   */
  private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas)
      throws Exception {
    TesseractOCRConfig config = new TesseractOCRConfig();
    Parser parser =
        new RecursiveParserWrapper(
            new AutoDetectParser(),
            new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, config);
    parseContext.set(Parser.class, parser);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
      parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
    }
    List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
    assertEquals(numMetadatas, metadataList.size());

    // Concatenate the text of the container and of all embedded documents
    StringBuilder contents = new StringBuilder();
    for (Metadata m : metadataList) {
      contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    String text = contents.toString();

    // The OCR text is only checked when canRun() reports true
    if (canRun()) {
      // endsWith also copes with a resource name that contains no dot at all
      if (resource.endsWith(".jpg")) {
        assertTrue(text.contains("Apache"));
      } else {
        assertTrue(text.contains("Happy New Year 2003!"));
      }
    }
    for (String needle : nonOCRContains) {
      assertContains(needle, text);
    }
    assertTrue(metadataList.get(0).names().length > 10);
    assertTrue(metadataList.get(1).names().length > 10);
    // test at least one value
    assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
  }
Example #18
0
  /**
   * Demo entry point: parses a hard-coded MP3 file with {@code Mp3Parser} and prints all of its
   * metadata, followed by a few selected fields, to standard output.
   *
   * @param args command-line arguments; not used
   */
  public static void main(String[] args) {
    String fileLocation = "C:\\Users\\Public\\Music\\Sample Music\\Kalimba.mp3";

    // try-with-resources closes the file even when parsing fails part-way through
    try (InputStream input = new FileInputStream(new File(fileLocation))) {
      ContentHandler handler = new DefaultHandler();
      Metadata metadata = new Metadata();
      Parser parser = new Mp3Parser();
      ParseContext parseCtx = new ParseContext();
      parser.parse(input, handler, metadata, parseCtx);

      // List all metadata
      for (String name : metadata.names()) {
        System.out.println(name + ": " + metadata.get(name));
      }

      // Retrieve the necessary info from metadata.
      // NOTE(review): the original remark says the key names below (title, xmpDM:artist, ...)
      // "may differ" but is cut off before saying on what - confirm against the Tika version.
      System.out.println("----------------------------------------------");
      System.out.println("Title: " + metadata.get("title"));
      System.out.println("Artists: " + metadata.get("xmpDM:artist"));
      System.out.println("Composer : " + metadata.get("xmpDM:composer"));
      System.out.println("Genre : " + metadata.get("xmpDM:genre"));
      System.out.println("Album : " + metadata.get("xmpDM:album"));

    } catch (IOException | SAXException | TikaException e) {
      // FileNotFoundException is an IOException, so a missing file is reported here as well
      e.printStackTrace();
    }
  }
Example #19
0
  /**
   * Parses the product's file with Tika's auto-detecting parser and converts the resulting Tika
   * metadata with {@code transform}.
   *
   * @param product product whose file is looked up via {@code getProductFile}
   * @return the transformed metadata
   * @throws MetExtractionException if the file is missing, cannot be read or cannot be parsed
   */
  private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
    try {
      File file = getProductFile(product);
      // try-with-resources closes the file stream on success and on failure
      try (FileInputStream inputStream = new FileInputStream(file)) {
        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
        return transform(tikaMetadata);
      }

    } catch (FileNotFoundException e) {
      throw new MetExtractionException("Unable to find file: Reason: " + e.getMessage());
    } catch (TikaException e) {
      throw new MetExtractionException("Unable to parse the document: Reason: " + e.getMessage());
    } catch (SAXException e) {
      throw new MetExtractionException(
          " Unable to process the SAX events : Reason: " + e.getMessage());
    } catch (IOException e) {
      throw new MetExtractionException(
          "Unable to read the document stream: Reason: " + e.getMessage());
    }
  }
Example #20
0
  /**
   * Parses the OpenOffice 3 sample document with every parser returned by {@code getParsers()}
   * and checks the content type and a set of expected phrases.
   */
  @Test
  public void testOO3() throws Exception {
    String[] expectedPhrases = {
      "Tika is part of the Lucene project.",
      "Solr",
      "one embedded",
      "Rectangle Title",
      "a blue background and dark border"
    };

    for (Parser candidate : getParsers()) {
      try (InputStream document =
          ODFParserTest.class.getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
        ContentHandler bodyHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        candidate.parse(document, bodyHandler, metadata, new ParseContext());

        assertEquals(
            "application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));

        String extracted = bodyHandler.toString();
        for (String phrase : expectedPhrases) {
          assertContains(phrase, extracted);
        }
      }
    }
  }
  /**
   * Extracts text and metadata from an office file and wraps them in a document.
   *
   * <p>Files whose lower-cased name ends in {@code x} are parsed with {@code OOXMLParser}, all
   * others with {@code OfficeParser}.
   *
   * @param uriroot root used to build the document URL; no URL is set when {@code null}
   * @param file the file to crawl
   * @return the populated document
   * @throws CrawlException wrapping any exception raised while reading or parsing the file
   */
  @Override
  public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap result = new DocumentWrap();

    try {
      openFile(file);

      ContentHandler bodyHandler = new BodyContentHandler();
      Metadata tikaMetadata = new Metadata();
      ParseContext tikaContext = new ParseContext();

      boolean looksLikeOoxml = file.getName().toLowerCase().endsWith("x");
      Parser officeParser = looksLikeOoxml ? new OOXMLParser() : new OfficeParser();
      officeParser.parse(getFileStream(), bodyHandler, tikaMetadata, tikaContext);

      result.setAuthor(tikaMetadata.get(Metadata.AUTHOR));
      result.setSummary(tikaMetadata.get(Metadata.COMMENTS));
      result.setContent(bodyHandler.toString(), bStoreBody);
      result.setSize((int) file.length());

      result.setId(file.getCanonicalPath());

      if (uriroot != null) {
        result.setURL(getUrl(uriroot, file));
      }
    } catch (FileNotFoundException missing) {
      throw new CrawlException("File not found: " + file, missing);
    } catch (Exception failure) {
      // Covers IOException as well: both are reported with the same message
      throw new CrawlException("File: " + file, failure);
    } finally {
      closeFile();
    }

    return result;
  }
  /**
   * Extracts text and metadata from a PDF file and wraps them in a document of type {@code
   * application/pdf}.
   *
   * @param uriroot root used to build the document URL; no URL is set when {@code null}
   * @param file the file to crawl
   * @return the populated document
   * @throws CrawlException wrapping any exception raised while reading or parsing the file
   */
  public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap result = new DocumentWrap();

    try {
      openFile(file);

      ContentHandler bodyHandler = new BodyContentHandler();
      Metadata tikaMetadata = new Metadata();
      ParseContext tikaContext = new ParseContext();

      Parser pdfParser = new PDFParser();
      pdfParser.parse(getFileStream(), bodyHandler, tikaMetadata, tikaContext);

      // Body, size and type
      result.setContent(bodyHandler.toString(), bStoreBody);
      result.setSize((int) file.length());
      result.setType("application/pdf");

      // Descriptive fields taken from the PDF metadata
      result.setAuthor(tikaMetadata.get(Metadata.AUTHOR));
      result.setName(tikaMetadata.get(Metadata.TITLE));
      result.setSummary(tikaMetadata.get(Metadata.SUBJECT));
      result.setAttribute("keywords", tikaMetadata.get(Metadata.KEYWORDS));

      result.setId(file.getCanonicalPath());

      if (uriroot != null) {
        result.setURL(getUrl(uriroot, file));
      }
    } catch (FileNotFoundException missing) {
      throw new CrawlException("File not found: " + file, missing);
    } catch (Exception failure) {
      // Covers IOException as well: both are reported with the same message
      throw new CrawlException("File: " + file, failure);
    } finally {
      closeFile();
    }

    return result;
  }
Example #23
0
  /**
   * Parses the stream and adds the extracted text to the document; metadata and language
   * information are added on request.
   *
   * @param doc document to enrich
   * @param fieldPrefix prefix handed to the add* helpers
   * @param stream content to parse
   * @param parser parser to use; also registered in the parse context
   * @param addMetaData whether the parsed metadata is added to the document
   * @param addLanguage whether language information derived from the text is added
   */
  public static void enrichDocumentWithFileContents(
      LocalDocument doc,
      String fieldPrefix,
      InputStream stream,
      Parser parser,
      boolean addMetaData,
      boolean addLanguage)
      throws IOException, SAXException, TikaException {
    ParseContext tikaContext = new ParseContext();
    tikaContext.set(Parser.class, parser);

    Metadata tikaMetadata = new Metadata();
    StringWriter extractedText = new StringWriter();
    BodyContentHandler bodyHandler = new BodyContentHandler(extractedText);
    parser.parse(stream, bodyHandler, tikaMetadata, tikaContext);

    addTextToDocument(doc, fieldPrefix, extractedText);

    if (addMetaData) {
      addMetadataToDocument(doc, fieldPrefix, tikaMetadata);
    }

    if (addLanguage) {
      addLanguageToDocument(doc, fieldPrefix, extractedText.toString());
    }
  }
Example #24
0
  /**
   * Common implementation: reads the raw input stream with the Tika parser and returns a
   * ConvertedDocument holding title, encoding, creation date, author and text.
   *
   * <p>The input stream is closed before this method returns or throws.
   *
   * @param input stream for raw file
   * @param doc raw file
   * @return converted doc
   * @throws IOException if the underlying Tika parser failed for any reason; a {@code
   *     NoClassDefFoundError} is reported as a Tika misconfiguration
   */
  @Override
  protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
      throws IOException {
    BodyContentHandler bodyHandler = new BodyContentHandler(maxBuffer);
    Metadata tikaMetadata = new Metadata();

    try {
      parser.parse(input, bodyHandler, tikaMetadata, ctx);
    } catch (NoClassDefFoundError missingClass) {
      throw new IOException("Unable to parse content due to Tika misconfiguration", missingClass);
    } catch (Exception parseFailure) {
      throw new IOException("Unable to parse content", parseFailure);
    } finally {
      input.close();
    }

    ConvertedDocument converted = new ConvertedDocument(doc);
    converted.addTitle(tikaMetadata.get(TikaCoreProperties.TITLE));
    converted.setEncoding(tikaMetadata.get(Metadata.CONTENT_ENCODING));
    converted.addCreateDate(tikaMetadata.getDate(TikaCoreProperties.CREATED));
    converted.addAuthor(tikaMetadata.get(TikaCoreProperties.CREATOR));

    String extracted = bodyHandler.toString();
    if (extracted == null) {
      return converted;
    }

    // Historical note (v1.5): the blank-line reducer was applied to every document up to that
    // version. Under Java 6 it appeared to cause a StackOverflow on documents with hundreds of
    // \n in a row, e.g. a spreadsheet converted to text may have thousands of empty lines
    // after the last data row. Spreadsheets are therefore only trimmed.
    boolean spreadsheet =
        converted.filename != null && FileUtility.isSpreadsheet(converted.filename);
    if (spreadsheet) {
      converted.setText(extracted.trim());
    } else {
      converted.setText(TextUtils.reduce_line_breaks(extracted));
    }
    return converted;
  }
  /**
   * Parses {@code /test-documents/testJPEG_IPTC_EXT.jpg} with the fixture {@code parser} and
   * checks the IPTC metadata (core, extension and PLUS fields) that ends up in the {@link
   * Metadata} object.
   *
   * @throws Exception if the image cannot be read or parsed
   */
  public void testJPEGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    // Close the classpath stream once parsing is done instead of leaking it.
    try (InputStream stream =
        getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg")) {
      parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());
    }

    assertEquals("Washington", metadata.get(IPTC.CITY));
    assertEquals("United States", metadata.get(IPTC.COUNTRY));
    assertEquals("US", metadata.get(IPTC.COUNTRY_CODE));

    assertEquals(
        "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
        metadata.get(IPTC.DESCRIPTION));
    assertEquals(
        "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
        metadata.get(Metadata.DESCRIPTION));

    assertEquals("Rock Creek Park", metadata.get(IPTC.HEADLINE));
    assertEquals("Downstream", metadata.get(Metadata.TITLE));

    assertEquals("intellectual genre", metadata.get(IPTC.INTELLECTUAL_GENRE));

    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("stream"));
    assertTrue(iptcKeywords.contains("park"));
    assertTrue(iptcKeywords.contains("bank"));
    List<String> tikaKeywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("stream"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("park"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("bank"));

    assertEquals("DC", metadata.get(IPTC.PROVINCE_OR_STATE));

    List<String> iptcSceneCode = Arrays.asList(metadata.getValues(IPTC.SCENE_CODE));
    assertEquals(2, iptcSceneCode.size());
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 1"));
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 2"));

    List<String> iptcSubjectCode = Arrays.asList(metadata.getValues(IPTC.SUBJECT_CODE));
    assertEquals(2, iptcSubjectCode.size());
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 1"));
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 2"));

    assertEquals("Rock Creek Park", metadata.get(IPTC.SUBLOCATION));

    // Expected creation date: 2011-08-31T12:00:00.000 UTC (Calendar months are zero-based).
    GregorianCalendar calendar = new GregorianCalendar();
    calendar.set(Calendar.YEAR, 2011);
    calendar.set(Calendar.MONTH, 7);
    calendar.set(Calendar.DATE, 31);
    calendar.set(Calendar.HOUR_OF_DAY, 12);
    calendar.set(Calendar.MINUTE, 0);
    calendar.set(Calendar.SECOND, 0);
    calendar.set(Calendar.MILLISECOND, 0);
    calendar.setTimeZone(TimeZone.getTimeZone("UTC"));
    assertEquals(calendar.getTime(), metadata.getDate(IPTC.DATE_CREATED));

    assertEquals("Ray Gauss II", metadata.get(IPTC.DESCRIPTION_WRITER));
    assertEquals("instructions", metadata.get(IPTC.INSTRUCTIONS));
    assertEquals("job identifier", metadata.get(IPTC.JOB_ID));
    assertEquals("Downstream", metadata.get(IPTC.TITLE));
    assertTrue(metadata.get(IPTC.COPYRIGHT_NOTICE).contains("Ray Gauss II"));

    List<String> creators = Arrays.asList(metadata.getValues(IPTC.CREATOR));
    assertTrue(Arrays.toString(creators.toArray()).contains("Ray Gauss II"));

    assertEquals("DAM Architect", metadata.get(IPTC.CREATORS_JOB_TITLE));
    assertEquals("provider", metadata.get(IPTC.CREDIT_LINE));
    assertEquals("rights usage terms", metadata.get(IPTC.RIGHTS_USAGE_TERMS));
    assertEquals("source", metadata.get(IPTC.SOURCE));
    assertEquals("1234 Some Road", metadata.get(IPTC.CONTACT_INFO_ADDRESS));
    assertEquals("Atlanta", metadata.get(IPTC.CONTACT_INFO_CITY));
    assertEquals("US", metadata.get(IPTC.CONTACT_INFO_COUNTRY));

    List<String> ciWorkEmails = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_EMAIL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("*****@*****.**"));
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("*****@*****.**"));

    List<String> ciWorkTels = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_PHONE));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-1234"));
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-4321"));

    assertEquals("30339", metadata.get(IPTC.CONTACT_INFO_POSTAL_CODE));
    assertEquals("GA", metadata.get(IPTC.CONTACT_INFO_STATE_PROVINCE));

    List<String> ciWorkUrls = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_WEB_URL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://alfresco.com"));
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://example.com"));

    assertEquals("rocky 1 and rocky 2 are big", metadata.get(IPTC.ADDITIONAL_MODEL_INFO));

    List<String> orgCodes = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_CODE));
    assertEquals(2, orgCodes.size());
    assertEquals("ASPP", orgCodes.get(0));
    assertEquals("OTHER_ORG", orgCodes.get(1));

    // List<String> cvTerms = Arrays.asList(metadata.getValues(IPTC.CONTROLLED_VOCABULARY_TERM));

    List<String> modelAges = Arrays.asList(metadata.getValues(IPTC.MODEL_AGE));
    assertEquals(2, modelAges.size());
    assertEquals("1000", modelAges.get(0));
    assertEquals("1001", modelAges.get(1));

    List<String> orgNames = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_NAME));
    assertEquals(2, orgNames.size());
    assertEquals("ASPP", orgNames.get(0));
    assertEquals("Other Org", orgNames.get(1));

    List<String> peopleShown = Arrays.asList(metadata.getValues(IPTC.PERSON));
    assertEquals(2, peopleShown.size());
    assertEquals("rocky 1", peopleShown.get(0));
    assertEquals("rocky 2", peopleShown.get(1));

    assertEquals(
        "http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture",
        metadata.get(IPTC.DIGITAL_SOURCE_TYPE));
    assertEquals("Photo Bike Tour", metadata.get(IPTC.EVENT));

    assertEquals("RGAUSS", metadata.get(IPTC.IMAGE_SUPPLIER_ID));
    assertEquals("Ray Gauss II", metadata.get(IPTC.IMAGE_SUPPLIER_NAME));
    assertEquals("supplier image ID", metadata.get(IPTC.IMAGE_SUPPLIER_IMAGE_ID));
    assertEquals("3456", metadata.get(IPTC.MAX_AVAIL_HEIGHT));
    assertEquals("5184", metadata.get(IPTC.MAX_AVAIL_WIDTH));
    assertEquals("1.2.0", metadata.get(IPTC.PLUS_VERSION));

    List<String> copyrightOwnerIds = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_ID));
    assertEquals(1, copyrightOwnerIds.size());
    assertEquals("RGAUSS", copyrightOwnerIds.get(0));
    // assertEquals("", copyrightOwnerIds.get(1)); // TODO: Get ExifTool to preserve empty values

    List<String> copyrightOwnerNames = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_NAME));
    assertEquals(2, copyrightOwnerNames.size());
    assertEquals("Ray Gauss II", copyrightOwnerNames.get(0));
    assertEquals("GG", copyrightOwnerNames.get(1));

    List<String> imageCreatorIds = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_ID));
    assertEquals(1, imageCreatorIds.size());
    assertEquals("RGAUSS", imageCreatorIds.get(0));
    // assertEquals("", imageCreatorIds.get(1)); // TODO: Get ExifTool to preserve empty values

    assertTrue(metadata.isMultiValued(IPTC.IMAGE_CREATOR_NAME));
    List<String> imageCreatorNames = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_NAME));
    assertEquals(2, imageCreatorNames.size());
    assertEquals("Ray Gauss II", imageCreatorNames.get(0));
    assertEquals("GG", imageCreatorNames.get(1));

    List<String> licensorIds = Arrays.asList(metadata.getValues(IPTC.LICENSOR_ID));
    assertEquals("RGAUSS", licensorIds.get(0));

    assertTrue(metadata.isMultiValued(IPTC.LICENSOR_NAME));
    List<String> licensorNames = Arrays.asList(metadata.getValues(IPTC.LICENSOR_NAME));
    assertEquals(2, licensorNames.size());
    assertEquals("Ray Gauss II", licensorNames.get(0));
    assertEquals("GG", licensorNames.get(1));

    // Photoshop does not support licensor addresses, cities, or countries

    List<String> licensorEmails = Arrays.asList(metadata.getValues(IPTC.LICENSOR_EMAIL));
    assertEquals("*****@*****.**", licensorEmails.get(0));
    // assertEquals("", licensorEmails.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel1 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_1));
    assertEquals("555-5555", licensorTel1.get(0));
    // assertEquals("", licensorTel1.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel2 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_2));
    assertEquals("555-4444", licensorTel2.get(0));
    // assertEquals("", licensorTel2.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorUrls = Arrays.asList(metadata.getValues(IPTC.LICENSOR_URL));
    assertEquals("http://rgauss.com", licensorUrls.get(0));
    // assertEquals("", licensorUrls.get(1)); // TODO: Get ExifTool to preserve empty values

    assertEquals("Age Unknown", metadata.get(IPTC.MINOR_MODEL_AGE_DISCLOSURE));
    List<String> modelReleaseIds = Arrays.asList(metadata.getValues(IPTC.MODEL_RELEASE_ID));
    assertEquals("model release id 1", modelReleaseIds.get(0));
    assertEquals("model release id 2", modelReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.MODEL_RELEASE_STATUS));

    List<String> propertyReleaseIds = Arrays.asList(metadata.getValues(IPTC.PROPERTY_RELEASE_ID));
    assertEquals("prop release id 1", propertyReleaseIds.get(0));
    assertEquals("prop release id 2", propertyReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.PROPERTY_RELEASE_STATUS));

    List<String> aoCopyright =
        Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE));
    assertEquals("Ray Gauss II", aoCopyright.get(0));
    // assertEquals("", aoCopyright.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoCopyright.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoCreator =
        Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_CREATOR));
    assertEquals("Mother Nature", aoCreator.get(0));
    assertEquals("Man", aoCreator.get(1));
    assertEquals("Mother Nature", aoCreator.get(2));
    List<String> aoDateCreated =
        Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED));
    assertEquals("1890:01:01", aoDateCreated.get(0));
    // assertEquals("", aoDateCreated.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("1901:02:01", aoDateCreated.get(1));
    // assertEquals("", aoDateCreated.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSource = Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE));
    assertEquals("National Park Service", aoSource.get(0));
    // assertEquals("", aoSource.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoSource.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSourceInventoryNum =
        Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER));
    assertEquals("123456", aoSourceInventoryNum.get(0));
    // assertEquals("", aoSourceInventoryNum.get(1)); // TODO: Get ExifTool to preserve empty values
    // This should be index 2, TODO: Get ExifTool to preserve empty values
    assertEquals("654321", aoSourceInventoryNum.get(1));
    List<String> aoSourceTitles =
        Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_TITLE));
    assertEquals("Rock Creek Stream Bank", aoSourceTitles.get(0));
    assertEquals("Pollution", aoSourceTitles.get(1));
    assertEquals("Some Tree", aoSourceTitles.get(2));

    List<String> locationShownCity = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_CITY));
    assertEquals("Washington", locationShownCity.get(0));
    // assertEquals("", locationShownCity.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownCountryCode =
        Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_CODE));
    assertEquals("US", locationShownCountryCode.get(0));
    // assertEquals("", locationShownCountryCode.get(1)); // TODO: Get ExifTool to preserve empty
    // values
    List<String> locationShownCountryName =
        Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_NAME));
    assertEquals("United States", locationShownCountryName.get(0));
    // assertEquals("", locationShownCountryName.get(1)); // TODO: Get ExifTool to preserve empty
    // values
    List<String> locationShownState =
        Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_PROVINCE_OR_STATE));
    assertEquals("D.C.", locationShownState.get(0));
    // assertEquals("", locationShownState.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownSublocation =
        Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_SUBLOCATION));
    assertEquals("Rock Creek Park Sub", locationShownSublocation.get(0));
    assertEquals("Stream Section", locationShownSublocation.get(1));
    List<String> locationShownWorldRegion =
        Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_WORLD_REGION));
    assertEquals("North America", locationShownWorldRegion.get(0));
    // assertEquals("", locationShownWorldRegion.get(1)); // TODO: Get ExifTool to preserve empty
    // values

    assertEquals("Washington", metadata.get(IPTC.LOCATION_CREATED_CITY));
    assertEquals("US", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_CODE));
    assertEquals("United States", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_NAME));
    assertEquals("D.C.", metadata.get(IPTC.LOCATION_CREATED_PROVINCE_OR_STATE));
    assertEquals("Rock Creek Park", metadata.get(IPTC.LOCATION_CREATED_SUBLOCATION));
    assertEquals("North America", metadata.get(IPTC.LOCATION_CREATED_WORLD_REGION));

    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    List<String> registryEntryOrgIds =
        Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    assertEquals(2, registryEntryOrgIds.size());
    assertEquals("PLUS", registryEntryOrgIds.get(0));
    // assertEquals("", registryEntryOrgIds.get(1)); // TODO: Get ExifTool to preserve empty values
    // This should be index 2, TODO: Get ExifTool to preserve empty values
    assertEquals("ORG 2", registryEntryOrgIds.get(1));

    // Check the item-id property itself (the organisation-id one is already covered above).
    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    List<String> registryEntryItemIds =
        Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    assertEquals(3, registryEntryItemIds.size());
    assertEquals("100-ABC-ABC-555", registryEntryItemIds.get(0));
    assertEquals("11223344", registryEntryItemIds.get(1));
    assertEquals("55667788", registryEntryItemIds.get(2));
  }
  /**
   * Builds the shared fixtures: parses every entry of {@code testPages} with the Tika parser
   * registered for {@code text/html} into a DOM fragment stored in {@code testDOMs}, resolves the
   * matching base URL into {@code testBaseHrefURLs}, and fills {@code answerOutlinks} with the
   * outlinks expected for each page.
   *
   * @throws Exception if the configuration, parser lookup or expected outlinks cannot be created
   */
  private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    TikaParser tikaParser = new TikaParser();
    tikaParser.setConf(conf);
    Parser parser = tikaParser.getTikaConfig().getParser("text/html");
    for (int i = 0; i < testPages.length; i++) {
      Metadata tikamd = new Metadata();

      HTMLDocumentImpl doc = new HTMLDocumentImpl();
      doc.setErrorChecking(false);
      DocumentFragment root = doc.createDocumentFragment();
      DOMBuilder domhandler = new DOMBuilder(doc, root);
      ParseContext context = new ParseContext();
      // to add once available in Tika
      // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
      try {
        // Encode with an explicit charset so the fixture bytes do not depend on the
        // platform default encoding.
        parser.parse(
            new ByteArrayInputStream(testPages[i].getBytes("UTF-8")), domhandler, tikamd, context);
        testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
      } catch (Exception e) {
        e.printStackTrace();
        fail("caught exception: " + e);
      }
      testDOMs[i] = root;
      LSSerializerImpl lsi = new LSSerializerImpl();
      System.out.println("input " + i + ": '" + testPages[i] + "'");
      System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'");
    }
    // Expected outlinks, indexed in parallel with testPages.
    answerOutlinks =
        new Outlink[][] {
          // 0
          {
            new Outlink("http://www.nutch.org", "anchor"),
          },
          // 1
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
          },
          // 2
          {
            new Outlink("http://www.nutch.org/", "separate this"),
            new Outlink("http://www.nutch.org/docs/ok", "from this"),
          },

          // 3
          {
            new Outlink("http://www.nutch.org/", "home"),
            new Outlink("http://www.nutch.org/docs/1", "1"),
            new Outlink("http://www.nutch.org/docs/2", "2"),
          },
          // 4
          {
            new Outlink("http://www.nutch.org/frames/top.html", ""),
            new Outlink("http://www.nutch.org/frames/left.html", ""),
            new Outlink("http://www.nutch.org/frames/invalid.html", ""),
            new Outlink("http://www.nutch.org/frames/right.html", ""),
          },
          // 5
          {
            new Outlink("http://www.nutch.org/maps/logo.gif", ""),
            new Outlink("http://www.nutch.org/index.html", ""),
            new Outlink("http://www.nutch.org/maps/#bottom", ""),
            new Outlink("http://www.nutch.org/bot.html", ""),
            new Outlink("http://www.nutch.org/docs/index.html", "")
          },
          // 6
          {
            new Outlink("http://www.nutch.org/index.html", "whitespace test"),
          },
          // 7
          {},
          // 8
          {
            new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
          },
          // 9
          {},
          // 10
          {
            new Outlink("http://www.nutch.org/;x", "anchor1"),
            new Outlink("http://www.nutch.org/g;x", "anchor2"),
            new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
          },
          // 11
          {
            // this is tricky - see RFC3986 section 5.4.1 example 7
            new Outlink("http://www.nutch.org/g", "anchor1"),
            new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
            new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
            new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
            new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
          }
        };
  }
Example #27
0
  /**
   * Parses fetched content with the Tika parser registered for its MIME type and converts the
   * result into a Nutch {@link ParseResult}.
   *
   * <p>The SAX events produced by Tika are collected into a DOM fragment; text, title and
   * outlinks are then extracted from that fragment, honouring the page's noindex / nofollow /
   * nocache / refresh meta directives. Finally the HTML parse filters are run on the result.
   *
   * @param content the fetched content to parse
   * @return the filtered parse result, or an empty parse result carrying a failure status when
   *     the base URL is malformed, no parser is available for the MIME type, or parsing throws
   */
  public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    // Guarded like the trace logging below so the message is only built when needed.
    if (LOG.isDebugEnabled()) {
      LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);
    }

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + content.getUrl(), e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      // A base URL found in the document takes precedence over the content's base URL.
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata (the title is carried separately)
    for (String tikaMDName : tikamd.names()) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) {
        continue;
      }
      // Copy every value so multi-valued entries are not truncated to their first value.
      for (String value : tikamd.getValues(tikaMDName)) {
        nutchMetadata.add(tikaMDName, value);
      }
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
          });
    }
    ParseData parseData =
        new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
        ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
      }
    }
    return filteredParse;
  }
  /**
   * Extracts content and metadata from a content stream with Tika and either indexes the result
   * or, in extract-only mode, returns the serialized extraction in the response.
   *
   * <p>The parser is looked up from the {@code ExtractingParams.STREAM_TYPE} request parameter
   * when present; otherwise the auto-detecting parser is used.
   *
   * @param req the request whose parameters, schema and core are consulted
   * @param rsp the response that receives the extracted text and metadata in extract-only mode
   * @param stream the content stream to parse
   * @param processor the update processor passed in by the caller (not used directly here)
   * @throws Exception on I/O failure; a {@code SolrException} with {@code BAD_REQUEST} is thrown
   *     when no parser matches the stream type, and with {@code SERVER_ERROR} on SAX errors or on
   *     Tika errors that are not configured to be ignored
   */
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            // The MatchingContentHandler does not invoke startDocument.  See
            // http://tika.markmail.org/message/kknu3hw7argwiqin
            serializer.startDocument();
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              try {
                log.debug("Password file supplied: " + pwMapFile);
                epp.parse(is);
              } finally {
                // This method opened the stream, so make sure it is released even if
                // reading the rules fails.
                IOUtils.closeQuietly(is);
              }
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          if (ignoreTikaException) {
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          } else {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
          }
        }
        if (!extractOnly) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList<String[]> metadataNL = new NamedList<String[]>();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }
Example #29
0
  /**
   * Runs every parser returned by {@code getParsers()} over {@code testOpenOffice2.odt} and
   * checks the content type, dates, document statistics (in all three naming styles), the
   * value-less custom tags and the extracted body text.
   */
  @Test
  public void testOO2() throws Exception {
    final String modifiedStamp = "2007-09-14T11:07:10";
    final String createdStamp = "2007-09-14T11:06:08";
    // Very old style statistics as {key, expected value} pairs (these will be removed shortly)
    final String[][] legacyStats = {
      {"nbTab", "0"},
      {"nbObject", "0"},
      {"nbImg", "0"},
      {"nbPage", "1"},
      {"nbPara", "1"},
      {"nbWord", "14"},
      {"nbCharacter", "78"},
    };
    // Custom metadata tags present but without values
    final String[] emptyCustomTags = {
      "custom:Info 1", "custom:Info 2", "custom:Info 3", "custom:Info 4",
    };

    for (Parser candidate : getParsers()) {
      try (InputStream odt =
          ODFParserTest.class.getResourceAsStream("/test-documents/testOpenOffice2.odt")) {
        Metadata meta = new Metadata();
        ContentHandler body = new BodyContentHandler();
        candidate.parse(odt, body, meta, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text", meta.get(Metadata.CONTENT_TYPE));
        assertEquals("en-US", meta.get(Metadata.LANGUAGE));
        assertEquals("PT1M7S", meta.get(Metadata.EDIT_TIME));
        assertEquals(
            "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161", meta.get("generator"));

        // Dates, under both the new-style and the old-style keys
        assertEquals(modifiedStamp, meta.get(TikaCoreProperties.MODIFIED));
        assertEquals(modifiedStamp, meta.get(Metadata.MODIFIED));
        assertEquals(modifiedStamp, meta.get(Metadata.DATE));
        assertEquals(createdStamp, meta.get(TikaCoreProperties.CREATED));
        assertEquals(createdStamp, meta.get(Metadata.CREATION_DATE));

        // Document statistics
        assertEquals("1", meta.get(Office.PAGE_COUNT));
        assertEquals("1", meta.get(Office.PARAGRAPH_COUNT));
        assertEquals("14", meta.get(Office.WORD_COUNT));
        assertEquals("78", meta.get(Office.CHARACTER_COUNT));
        assertEquals("0", meta.get(Office.TABLE_COUNT));
        assertEquals("0", meta.get(Office.OBJECT_COUNT));
        assertEquals("0", meta.get(Office.IMAGE_COUNT));

        // Tika-1.0 style document statistics
        assertEquals("1", meta.get(Metadata.PAGE_COUNT));
        assertEquals("1", meta.get(Metadata.PARAGRAPH_COUNT));
        assertEquals("14", meta.get(Metadata.WORD_COUNT));
        assertEquals("78", meta.get(Metadata.CHARACTER_COUNT));
        assertEquals("0", meta.get(Metadata.TABLE_COUNT));
        assertEquals("0", meta.get(Metadata.OBJECT_COUNT));
        assertEquals("0", meta.get(Metadata.IMAGE_COUNT));

        for (String[] stat : legacyStats) {
          assertEquals(stat[1], meta.get(stat[0]));
        }

        for (String tag : emptyCustomTags) {
          assertEquals(null, meta.get(tag));
        }

        String extracted = body.toString();
        boolean hasSampleSentence =
            extracted.contains(
                "This is a sample Open Office document,"
                    + " written in NeoOffice 2.2.1 for the Mac.");
        assertTrue(hasSampleSentence);
      }
    }
  }
    /**
     * Parses one raw email and emits a tuple describing it.
     *
     * <p>The message text is read from the {@code "email"} argument and run through {@code
     * _parser}; the resulting metadata and body text are emitted as a {@code (messageId, author,
     * address, subject, creationDate, replyId, content)} tuple. Messages much larger than the
     * running average are counted in {@code _numSkipped} and dropped. Parse failures are logged
     * (with stack trace) and the message is dropped; nothing is rethrown to the caller.
     *
     * @param flowProcess the current flow process (not used by this method)
     * @param functionCall supplies the {@code "email"} argument and the output collector
     */
    @Override
    public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
      String email = functionCall.getArguments().getString("email");
      _numEmails += 1;

      Metadata metadata = new Metadata();

      try {
        InputStream stream = new ByteArrayInputStream(email.getBytes("UTF-8"));
        _parser.parse(stream, _handler, metadata, new ParseContext());

        // _content now has all of the body text, and metadata has the header info.
        String messageId = getMetadata(metadata, TikaCoreProperties.IDENTIFIER);

        // Split the creator into a display name and an address when it matches the full
        // pattern; otherwise fall back to a bare address. Both stay empty if neither pattern
        // matches.
        String author = "";
        String address = "";
        String creator = getMetadata(metadata, TikaCoreProperties.CREATOR);
        Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(creator);
        if (addressMatcher.matches()) {
          author = addressMatcher.group(1);
          address = addressMatcher.group(2);
        } else {
          addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(creator);
          if (addressMatcher.matches()) {
            address = addressMatcher.group(1);
          }
        }

        String subject = getMetadata(metadata, TikaCoreProperties.TITLE);
        String replyId = getMetadata(metadata, TikaCoreProperties.RELATION);
        String creationDate = getMetadata(metadata, TikaCoreProperties.CREATED);

        String content = _content.toString();
        // The running total includes this email, even if it ends up being skipped below.
        _emailChars += content.length();

        // If size is greater than say 4x average, skip it. Otherwise we can get
        // some huge emails when a person includes all of the source code for their
        // project. The check only kicks in after the first 100 emails.
        if ((_numEmails > 100) && (content.length() > (4 * _emailChars / _numEmails))) {
          _numSkipped += 1;
          return;
        }

        // Need to convert all CRLF & raw linefeeds into \n sequences, so our file format is
        // correct. The alternation tries CRLF first, so a CRLF pair becomes a single \n.
        // We do the same for tabs, so that it's easy to parse the result.
        content = content.replaceAll("\r\n|[\r\n]", "\\\\n");
        content = content.replaceAll("\t", "\\\\t");

        Tuple tuple =
            new Tuple(messageId, author, address, subject, creationDate, replyId, content);
        functionCall.getOutputCollector().add(tuple);
      } catch (Exception e) {
        // Pass the exception itself so the stack trace and cause are not lost.
        LOGGER.error("Exception parsing email: " + e.getMessage(), e);
      } catch (NoClassDefFoundError e) {
        // This will happen when we have an embedded object (multi-part email) which
        // needs parsing support we don't include.
        LOGGER.error("Exception parsing email due to missing class: " + e.getMessage(), e);
      }
    }