public void parse(
        InputStream stream, ContentHandler baseHandler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
    // As we don't know which of the metadata or the content
    // we'll hit first, catch the endDocument call initially
    EndDocumentShieldingContentHandler handler =
            new EndDocumentShieldingContentHandler(baseHandler);

    // Process the file in turn
    ZipInputStream zip = new ZipInputStream(stream);
    ZipEntry entry = zip.getNextEntry();
    while (entry != null) {
        if (entry.getName().equals("mimetype")) {
            String type = IOUtils.toString(zip, "UTF-8");
            metadata.set(Metadata.CONTENT_TYPE, type);
        } else if (entry.getName().equals("meta.xml")) {
            meta.parse(zip, new DefaultHandler(), metadata, context);
        } else if (entry.getName().endsWith("content.xml")) {
            content.parse(zip, handler, metadata, context);
        }
        entry = zip.getNextEntry();
    }

    // Only now call the end document
    if (handler.getEndDocumentWasCalled()) {
        handler.reallyEndDocument();
    }
}
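// A minimal sketch (an assumption, not the actual implementation) of the shielding
// decorator used above, inferred from the three calls the parse method makes on it:
// it swallows endDocument() until reallyEndDocument() is invoked explicitly, because
// meta.xml and content.xml can appear in either order inside the zip.
class EndDocumentShieldingContentHandler extends org.apache.tika.sax.ContentHandlerDecorator {
    private boolean endDocumentCalled = false;

    EndDocumentShieldingContentHandler(ContentHandler handler) {
        super(handler);
    }

    @Override
    public void endDocument() {
        // Record the call but don't forward it yet
        endDocumentCalled = true;
    }

    public boolean getEndDocumentWasCalled() {
        return endDocumentCalled;
    }

    public void reallyEndDocument() throws SAXException {
        super.endDocument();
    }
}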
private void scan(ByteArrayInputStream in, String path, SVNDirEntry dirEntry) {
    try {
        Metadata metadata = new Metadata();
        metadata.set(Metadata.RESOURCE_NAME_KEY, path);

        // The following code part is from a proposal of the authors of Tika:
        // https://issues.apache.org/jira/browse/TIKA-232
        TikaConfig config = TikaConfig.getDefaultConfig(); // without a delegate parser
        Parser parser = new AutoDetectParser(config);
        ContentHandler handler = new BodyContentHandler();
        parser.parse(in, handler, metadata);

        getDocument().addTokenizedField(FieldNames.CONTENTS, handler.toString());
    } catch (Exception e) {
        LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e);
    } finally {
        try {
            in.close();
        } catch (Exception e) {
            LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e);
        }
    }
}
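// Hedged note: the three-argument parse(in, handler, metadata) used above comes from
// early Tika releases; on current versions the equivalent call passes an explicit
// ParseContext:
//
//     parser.parse(in, handler, metadata, new ParseContext());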
@Test
public void testRarParsing() throws Exception {
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream stream =
            RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
        parser.parse(stream, handler, metadata, recursingContext);
    }

    assertEquals("application/x-rar-compressed", metadata.get(Metadata.CONTENT_TYPE));

    String content = handler.toString();
    assertContains("test-documents/testEXCEL.xls", content);
    assertContains("Sample Excel Worksheet", content);
    assertContains("test-documents/testHTML.html", content);
    assertContains("Test Indexation Html", content);
    assertContains("test-documents/testOpenOffice2.odt", content);
    assertContains("This is a sample Open Office document", content);
    assertContains("test-documents/testPDF.pdf", content);
    assertContains("Apache Tika", content);
    assertContains("test-documents/testPPT.ppt", content);
    assertContains("Sample Powerpoint Slide", content);
    assertContains("test-documents/testRTF.rtf", content);
    assertContains("indexation Word", content);
    assertContains("test-documents/testTXT.txt", content);
    assertContains("Test d'indexation de Txt", content);
    assertContains("test-documents/testWORD.doc", content);
    assertContains("This is a sample Microsoft Word Document", content);
    assertContains("test-documents/testXML.xml", content);
    assertContains("Rida Benjelloun", content);
}
@Override
protected void processSubDataEntity(
        MultiValueHashMap<String, Object> subDataEntityInformation,
        Metadata metadata,
        ContentHandler handler2use4recursiveCall,
        ParseContext context)
        throws Exception {

    URLName urlNameWithPassword =
            (URLName) subDataEntityInformation.getFirst("urlNameWithPassword");
    String strMessageId = (String) subDataEntityInformation.getFirst("Message-ID");
    String strMessageFolder = (String) subDataEntityInformation.getFirst("folder");

    String strEntityId = ImapURLStreamProvider.getEntityId(strMessageFolder, strMessageId);

    // We set these here up front - the data was already loaded efficiently in a
    // prefetching step. If the values are already present in the Metadata object,
    // addFirstMetadata will not load them again.
    metadata.set(Metadata.SOURCE, urlNameWithPassword.toString());
    metadata.set(IncrementalCrawlingHistory.dataEntityId, strEntityId);
    metadata.set(
            IncrementalCrawlingHistory.dataEntityContentFingerprint,
            ImapURLStreamProvider.getDataEntityContentFingerprint(strEntityId));
    URLName urlNameWithoutPassword =
            new URLName(
                    urlNameWithPassword.getProtocol(),
                    urlNameWithPassword.getHost(),
                    urlNameWithPassword.getPort(),
                    urlNameWithPassword.getFile(),
                    urlNameWithPassword.getUsername(),
                    "");
    metadata.set(Metadata.RESOURCE_NAME_KEY, urlNameWithoutPassword.toString());
    if (strMessageId == null) {
        metadata.set("Content-Type", DatasourceMediaTypes.IMAPFOLDER.toString());
    } else {
        metadata.set("Content-Type", "message/rfc822");
    }

    metadata =
            URLStreamProvider.getURLStreamProvider4Protocol(urlNameWithPassword.getProtocol())
                    .addFirstMetadata(urlNameWithPassword, metadata, context);

    InputStream stream =
            URLStreamProvider.getURLStreamProvider(urlNameWithPassword)
                    .getStream(urlNameWithPassword, metadata, context);

    try {
        if (m_leech == null) {
            m_leech = new Leech();
        }

        // For a message, this will hopefully pick the Tika RFC822Parser
        Parser parser = m_leech.getParser();
        parser.parse(stream, handler2use4recursiveCall, metadata, context);
    } finally {
        if (stream != null) {
            stream.close();
        }
    }
}
public void testJPEG() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG.jpg");
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());

    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    for (String name : metadata.names()) {
        logger.trace("JPEG-- " + name + "=" + metadata.get(name));
    }
}
public void testPNGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/png");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testPNG_IPTC.png");
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());

    for (String name : metadata.names()) {
        logger.trace("PNG-- " + name + "=" + metadata.get(name));
    }
    assertEquals("100", metadata.get(TIFF.IMAGE_WIDTH));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
}
public void testJPEGCustomXmp() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg");

    ArrayList<Property> passthroughXmpProperties = new ArrayList<Property>(2);
    passthroughXmpProperties.add(Property.internalText("XMP-custom:Text"));
    passthroughXmpProperties.add(Property.internalText("XMP-custom:TextML"));
    Parser passthroughParser = new ExiftoolImageParser(null, passthroughXmpProperties);
    passthroughParser.parse(stream, new DefaultHandler(), metadata, new ParseContext());

    assertEquals("customTextField", metadata.get("XMP-custom:Text"));
    assertEquals("customMultilineField", metadata.get("XMP-custom:TextML"));
}
private void manageDetails(final GetItemResponse response, final DataHandler stream) {
    InputStream is = null;
    ContentHandler contenthandler = new BodyContentHandler();
    Metadata metadata = new Metadata();
    // metadata.set(Metadata.RESOURCE_NAME_KEY, f.getName());
    Parser parser = new AutoDetectParser();
    ParseContext context = new ParseContext();
    try {
        is = stream.getInputStream();
        parser.parse(is, contenthandler, metadata, context);
        is.close();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    }

    String contentAuthorValue = metadata.get(Metadata.AUTHOR);
    String contentAuthorKey =
            currentProperties.getProperty(KpeopleLabel.getCorePropertiesAuthor());
    if (contentAuthorValue != null) {
        eventResult.setDetail(contentAuthorKey, contentAuthorValue);
    }

    String contentCreationDateValue = metadata.get(Metadata.CREATION_DATE);
    String contentCreationDateKey =
            currentProperties.getProperty(KpeopleLabel.getCorePropertiesCreationDate());
    if (contentCreationDateValue != null) {
        eventResult.setDetail(contentCreationDateKey, contentCreationDateValue);
    }

    String contentKeywordsValue = metadata.get(Metadata.KEYWORDS);
    String contentKeywordsKey =
            currentProperties.getProperty(KpeopleLabel.getCorePropertiesKeywords());
    if (contentKeywordsValue != null) {
        eventResult.setDetail(contentKeywordsKey, contentKeywordsValue);
    }

    String[] names = metadata.names();
    /*
     * for (int i = 0; i < names.length; i++) { System.out.println(names[i]); }
     */
}
/** Tests that the ParseContext parser is correctly fired for all the embedded entries. */
@Test
public void testEmbedded() throws Exception {
    Parser parser = new AutoDetectParser(); // Should auto-detect!
    ContentHandler handler = new BodyContentHandler();
    Metadata metadata = new Metadata();

    try (InputStream stream =
            RarParserTest.class.getResourceAsStream("/test-documents/test-documents.rar")) {
        parser.parse(stream, handler, metadata, trackingContext);
    }

    // Should have found all 9 documents, but not the directory
    assertEquals(9, tracker.filenames.size());
    assertEquals(9, tracker.mediatypes.size());
    assertEquals(9, tracker.modifiedAts.size());

    // Should have names but not content types, as rar doesn't
    // store the content types
    assertEquals("test-documents/testEXCEL.xls", tracker.filenames.get(0));
    assertEquals("test-documents/testHTML.html", tracker.filenames.get(1));
    assertEquals("test-documents/testOpenOffice2.odt", tracker.filenames.get(2));
    assertEquals("test-documents/testPDF.pdf", tracker.filenames.get(3));
    assertEquals("test-documents/testPPT.ppt", tracker.filenames.get(4));
    assertEquals("test-documents/testRTF.rtf", tracker.filenames.get(5));
    assertEquals("test-documents/testTXT.txt", tracker.filenames.get(6));
    assertEquals("test-documents/testWORD.doc", tracker.filenames.get(7));
    assertEquals("test-documents/testXML.xml", tracker.filenames.get(8));
    for (String type : tracker.mediatypes) {
        assertNull(type);
    }
    for (String crt : tracker.createdAts) {
        assertNull(crt);
    }
    for (String mod : tracker.modifiedAts) {
        assertNotNull(mod);
        assertTrue("Modified at " + mod, mod.startsWith("20"));
    }

    // Should have filenames in the content string
    String content = handler.toString();
    assertContains("test-documents/testHTML.html", content);
    assertContains("test-documents/testEXCEL.xls", content);
    assertContains("test-documents/testOpenOffice2.odt", content);
    assertContains("test-documents/testPDF.pdf", content);
    assertContains("test-documents/testPPT.ppt", content);
    assertContains("test-documents/testRTF.rtf", content);
    assertContains("test-documents/testTXT.txt", content);
    assertContains("test-documents/testWORD.doc", content);
    assertContains("test-documents/testXML.xml", content);
}
public void testTIFFIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/tiff");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testTIFF_IPTC.tif");
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());

    for (String name : metadata.names()) {
        logger.trace("TIFF-- " + name + "=" + metadata.get(name));
    }
    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("garden"));
    assertTrue(iptcKeywords.contains("cat"));
    assertEquals("Cat in a garden", metadata.get(IPTC.HEADLINE));
}
@Override
protected boolean doProcess(Record record, InputStream inputStream) {
    Parser parser = detectParser(record);
    if (parser == null) {
        return false;
    }

    ParseContext parseContext = new ParseContext();
    parseContext.set(Locale.class, locale);

    Metadata metadata = new Metadata();
    for (Entry<String, Object> entry : record.getFields().entries()) {
        metadata.add(entry.getKey(), entry.getValue().toString());
    }

    SolrContentHandler handler =
            solrContentHandlerFactory.createSolrContentHandler(metadata, solrParams, schema);
    try {
        inputStream = TikaInputStream.get(inputStream);

        ContentHandler parsingHandler = handler;

        // String xpathExpr = "/xhtml:html/xhtml:body/xhtml:div/descendant:node()";
        if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            parsingHandler = new MatchingContentHandler(parsingHandler, matcher);
        }

        try {
            parser.parse(inputStream, parsingHandler, metadata, parseContext);
        } catch (IOException e) {
            throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (SAXException e) {
            throw new MorphlineRuntimeException("Cannot parse", e);
        } catch (TikaException e) {
            throw new MorphlineRuntimeException("Cannot parse", e);
        }
    } finally {
        if (inputStream != null) {
            Closeables.closeQuietly(inputStream);
        }
    }

    SolrInputDocument doc = handler.newDocument();
    LOG.debug("solr doc: {}", doc);
    Record outputRecord = toRecord(doc);
    return getChild().process(outputRecord);
}
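// Assumption: the PARSER constant referenced above is the usual Tika XPath helper,
// typically declared once per class as:
//
//     private static final XPathParser PARSER =
//             new XPathParser("xhtml", XHTMLContentHandler.XHTML);
//
// The matcher it produces restricts which SAX events reach the wrapped handler.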
/** Asks Tika to translate the contents into HTML */
private void generateHTML(Parser p, RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();

    // Setup things to parse with
    StringWriter sw = new StringWriter();
    ContentHandler handler = buildContentHandler(sw, context);

    // Tell Tika what we're dealing with
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, contentReader.getMimetype());
    metadata.set(
            Metadata.RESOURCE_NAME_KEY,
            nodeService.getProperty(context.getSourceNode(), ContentModel.PROP_NAME).toString());

    // Our parse context needs to extract images
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, new TikaImageExtractingParser(context));

    // Parse
    try {
        p.parse(contentReader.getContentInputStream(), handler, metadata, parseContext);
    } catch (Exception e) {
        throw new RenditionServiceException("Tika HTML Conversion Failed", e);
    }

    // As a string
    String html = sw.toString();

    // If we're doing body-only, remove all the html namespaces
    // that will otherwise clutter up the document
    boolean bodyOnly = context.getParamWithDefault(PARAM_BODY_CONTENTS_ONLY, false);
    if (bodyOnly) {
        html = html.replaceAll("<\\?xml.*?\\?>", "");
        html = html.replaceAll("<p xmlns=\"http://www.w3.org/1999/xhtml\"", "<p");
        html = html.replaceAll("<h(\\d) xmlns=\"http://www.w3.org/1999/xhtml\"", "<h\\1");
        html = html.replaceAll("<div xmlns=\"http://www.w3.org/1999/xhtml\"", "<div");
        html = html.replaceAll("<table xmlns=\"http://www.w3.org/1999/xhtml\"", "<table");
        // Strip leftover non-breaking-space entities (assumption: the source stripped
        // every space, which cannot be intended; &#160; is a guess at the original literal)
        html = html.replaceAll("&#160;", "");
    }

    // Save it
    ContentWriter contentWriter = context.makeContentWriter();
    contentWriter.setMimetype("text/html");
    contentWriter.putContent(html);
}
public void setBinaryContent(byte[] data) {
    InputStream inputStream = new ByteArrayInputStream(data);
    ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
    try {
        TransformerHandler handler =
                getTransformerHandler(outputStream, DEFAULT_OUTPUT_FORMAT, DEFAULT_ENCODING);
        AUTO_DETECT_PARSER.parse(inputStream, handler, new Metadata(), context);

        // Hacking the following line to remove Tika's inserted DocType
        String htmlContent =
                new String(outputStream.toByteArray(), DEFAULT_ENCODING)
                        .replace("http://www.w3.org/1999/xhtml", "");
        setHtml(htmlContent);
    } catch (Exception e) {
        logger.error("Error parsing file", e);
    }
}
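// The getTransformerHandler helper is not shown above; this is a minimal sketch of
// how such a helper is commonly written around Tika (the signature and the "method"
// parameter mapping are assumptions).
// Imports assumed: javax.xml.transform.OutputKeys, javax.xml.transform.TransformerFactory,
// javax.xml.transform.sax.SAXTransformerFactory, javax.xml.transform.sax.TransformerHandler,
// javax.xml.transform.stream.StreamResult.
private static TransformerHandler getTransformerHandler(
        OutputStream out, String method, String encoding)
        throws TransformerConfigurationException {
    SAXTransformerFactory factory =
            (SAXTransformerFactory) TransformerFactory.newInstance();
    TransformerHandler handler = factory.newTransformerHandler();
    handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method); // e.g. "html"
    handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding);
    handler.setResult(new StreamResult(out)); // serialize the SAX events to the stream
    return handler;
}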
public void process(InputStream input, OutputStream output, Metadata metadata) throws Exception {
    Parser p = parser;
    if (fork) {
        p = new ForkParser(TikaCLI.class.getClassLoader(), p);
    }

    ContentHandler handler = getContentHandler(output, metadata);
    p.parse(input, handler, metadata, context);

    // fix for TIKA-596: if a parser doesn't generate
    // XHTML output, the lack of an output document prevents
    // metadata from being output: this fixes that
    if (handler instanceof NoDocumentMetHandler) {
        NoDocumentMetHandler metHandler = (NoDocumentMetHandler) handler;
        if (!metHandler.metOutput()) {
            metHandler.endDocument();
        }
    }
}
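// Hedged usage sketch of the ForkParser seen above: it runs the parse in a separate
// JVM, so a crashing or memory-hungry parser cannot take down the host process.
// The pool size and the stream/handler names here are illustrative.
ForkParser forkParser = new ForkParser(TikaCLI.class.getClassLoader(), new AutoDetectParser());
try {
    forkParser.setPoolSize(4); // number of forked JVMs to keep around
    forkParser.parse(input, new BodyContentHandler(output), new Metadata(), new ParseContext());
} finally {
    forkParser.close(); // shuts the forked JVMs down
}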
// TIKA-1600: Test that null pointer doesn't break parsing.
@Test
public void testNullStylesInODTFooter() throws Exception {
    Parser parser = new OpenDocumentParser();
    try (InputStream input =
            ODFParserTest.class.getResourceAsStream("/test-documents/testODT-TIKA-6000.odt")) {
        Metadata metadata = new Metadata();
        ContentHandler handler = new BodyContentHandler();
        parser.parse(input, handler, metadata, new ParseContext());

        assertEquals("application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));

        String content = handler.toString();
        assertContains("Utilisation de ce document", content);
        assertContains("Copyright and License", content);
        assertContains("Changer la langue", content);
        assertContains("La page d’accueil permet de faire une recherche simple", content);
    }
}
protected static Metadata extractMetadata(
        InputStream inputStream, Metadata metadata, Parser parser) throws IOException {

    if (metadata == null) {
        metadata = new Metadata();
    }

    ParseContext parserContext = new ParseContext();
    parserContext.set(Parser.class, parser);

    ContentHandler contentHandler = new WriteOutContentHandler(new DummyWriter());

    try {
        parser.parse(inputStream, contentHandler, metadata, parserContext);
    } catch (Exception e) {
        Throwable throwable = ExceptionUtils.getRootCause(e);
        if ((throwable instanceof CryptographyException)
                || (throwable instanceof EncryptedDocumentException)
                || (throwable instanceof UnsupportedZipFeatureException)) {
            if (_log.isWarnEnabled()) {
                _log.warn("Unable to extract metadata from an encrypted file");
            }
        } else if (e instanceof TikaException) {
            if (_log.isWarnEnabled()) {
                _log.warn("Unable to extract metadata");
            }
        } else {
            _log.error(e, e);
        }
        throw new IOException(e);
    }

    // Remove potential security risks
    metadata.remove(XMPDM.ABS_PEAK_AUDIO_FILE_PATH.getName());
    metadata.remove(XMPDM.RELATIVE_PEAK_AUDIO_FILE_PATH.getName());

    return metadata;
}
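// Hypothetical caller (the file name is illustrative), inside a method that throws
// IOException: parse a document purely for its metadata, discarding the text via the
// DummyWriter used above.
try (InputStream in = new FileInputStream("sample.pdf")) {
    Metadata extracted = extractMetadata(in, null, new AutoDetectParser());
    System.out.println(extracted.get(Metadata.CONTENT_TYPE));
}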
private void testBasicOCR(String resource, String[] nonOCRContains, int numMetadatas)
        throws Exception {
    TesseractOCRConfig config = new TesseractOCRConfig();
    Parser parser =
            new RecursiveParserWrapper(
                    new AutoDetectParser(),
                    new BasicContentHandlerFactory(
                            BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));

    PDFParserConfig pdfConfig = new PDFParserConfig();
    pdfConfig.setExtractInlineImages(true);

    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, config);
    parseContext.set(Parser.class, parser);
    parseContext.set(PDFParserConfig.class, pdfConfig);

    try (InputStream stream = TesseractOCRParserTest.class.getResourceAsStream(resource)) {
        parser.parse(stream, new DefaultHandler(), new Metadata(), parseContext);
    }

    List<Metadata> metadataList = ((RecursiveParserWrapper) parser).getMetadata();
    assertEquals(numMetadatas, metadataList.size());

    StringBuilder contents = new StringBuilder();
    for (Metadata m : metadataList) {
        contents.append(m.get(RecursiveParserWrapper.TIKA_CONTENT));
    }
    if (canRun()) {
        if (resource.substring(resource.lastIndexOf('.')).equals(".jpg")) {
            assertTrue(contents.toString().contains("Apache"));
        } else {
            assertTrue(contents.toString().contains("Happy New Year 2003!"));
        }
    }

    for (String needle : nonOCRContains) {
        assertContains(needle, contents.toString());
    }
    assertTrue(metadataList.get(0).names().length > 10);
    assertTrue(metadataList.get(1).names().length > 10);
    // test at least one value
    assertEquals("deflate", metadataList.get(1).get("Compression CompressionTypeName"));
}
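// Hedged note: the wrapper usage above follows the older Tika 1.x API; in later
// releases the per-document metadata is collected through a handler instead, e.g.:
//
//     RecursiveParserWrapper wrapper = new RecursiveParserWrapper(new AutoDetectParser());
//     RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
//             new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1));
//     wrapper.parse(stream, handler, new Metadata(), parseContext);
//     List<Metadata> metadataList = handler.getMetadataList();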
/** @param args */
public static void main(String[] args) {
    // String fileLocation = "G:/asas/album/song.mp3";
    String fileLocation = "C:\\Users\\Public\\Music\\Sample Music\\Kalimba.mp3";
    try {
        InputStream input = new FileInputStream(new File(fileLocation));
        ContentHandler handler = new DefaultHandler();
        Metadata metadata = new Metadata();
        Parser parser = new Mp3Parser();
        ParseContext parseCtx = new ParseContext();
        parser.parse(input, handler, metadata, parseCtx);
        input.close();

        // List all metadata
        String[] metadataNames = metadata.names();
        for (String name : metadataNames) {
            System.out.println(name + ": " + metadata.get(name));
        }

        // Retrieve the necessary info from metadata
        // Names - title, xmpDM:artist etc. - mentioned below may differ based on the file
        System.out.println("----------------------------------------------");
        System.out.println("Title: " + metadata.get("title"));
        System.out.println("Artists: " + metadata.get("xmpDM:artist"));
        System.out.println("Composer : " + metadata.get("xmpDM:composer"));
        System.out.println("Genre : " + metadata.get("xmpDM:genre"));
        System.out.println("Album : " + metadata.get("xmpDM:album"));
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } catch (IOException e) {
        e.printStackTrace();
    } catch (SAXException e) {
        e.printStackTrace();
    } catch (TikaException e) {
        e.printStackTrace();
    }
}
private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
    try {
        File file = getProductFile(product);
        FileInputStream inputStream = new FileInputStream(file);
        org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
        Parser parser = new AutoDetectParser();
        parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
        return transform(tikaMetadata);
    } catch (FileNotFoundException e) {
        throw new MetExtractionException("Unable to find file: Reason: " + e.getMessage());
    } catch (TikaException e) {
        throw new MetExtractionException("Unable to parse the document: Reason: " + e.getMessage());
    } catch (SAXException e) {
        throw new MetExtractionException("Unable to process the SAX events: Reason: " + e.getMessage());
    } catch (IOException e) {
        throw new MetExtractionException("Unable to read the document stream: Reason: " + e.getMessage());
    }
}
@Test
public void testOO3() throws Exception {
    for (Parser parser : getParsers()) {
        try (InputStream input =
                ODFParserTest.class.getResourceAsStream("/test-documents/testODFwithOOo3.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));

            String content = handler.toString();
            assertContains("Tika is part of the Lucene project.", content);
            assertContains("Solr", content);
            assertContains("one embedded", content);
            assertContains("Rectangle Title", content);
            assertContains("a blue background and dark border", content);
        }
    }
}
@Override
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();
    try {
        openFile(file);
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        Parser parser;
        if (file.getName().toLowerCase().endsWith("x")) {
            parser = new OOXMLParser();
        } else {
            parser = new OfficeParser();
        }
        parser.parse(getFileStream(), textHandler, metadata, parseContext);

        document.setAuthor(metadata.get(Metadata.AUTHOR));
        document.setSummary(metadata.get(Metadata.COMMENTS));
        document.setContent(textHandler.toString(), bStoreBody);
        document.setSize((int) file.length());
        document.setId(file.getCanonicalPath());
        if (uriroot != null) {
            document.setURL(getUrl(uriroot, file));
        }
    } catch (FileNotFoundException e) {
        throw new CrawlException("File not found: " + file, e);
    } catch (IOException e) {
        throw new CrawlException("File: " + file, e);
    } catch (Exception e) {
        throw new CrawlException("File: " + file, e);
    } finally {
        closeFile();
    }
    return document;
}
public DocumentWrap crawl(String uriroot, File file) throws CrawlException {
    DocumentWrap document = new DocumentWrap();
    try {
        openFile(file);
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        ParseContext parseContext = new ParseContext();
        Parser parser = new PDFParser();
        parser.parse(getFileStream(), textHandler, metadata, parseContext);

        // Setup the document
        document.setContent(textHandler.toString(), bStoreBody);
        document.setSize((int) file.length());
        document.setType("application/pdf");
        document.setAuthor(metadata.get(Metadata.AUTHOR));
        document.setName(metadata.get(Metadata.TITLE));
        document.setSummary(metadata.get(Metadata.SUBJECT));
        document.setAttribute("keywords", metadata.get(Metadata.KEYWORDS));
        document.setId(file.getCanonicalPath());
        if (uriroot != null) {
            document.setURL(getUrl(uriroot, file));
        }
    } catch (FileNotFoundException e) {
        throw new CrawlException("File not found: " + file, e);
    } catch (IOException e) {
        throw new CrawlException("File: " + file, e);
    } catch (Exception e) {
        throw new CrawlException("File: " + file, e);
    } finally {
        closeFile();
    }
    return document;
}
public static void enrichDocumentWithFileContents(
        LocalDocument doc,
        String fieldPrefix,
        InputStream stream,
        Parser parser,
        boolean addMetaData,
        boolean addLanguage)
        throws IOException, SAXException, TikaException {
    Metadata metadata = new Metadata();
    ParseContext parseContext = new ParseContext();
    parseContext.set(Parser.class, parser);

    StringWriter textData = new StringWriter();
    parser.parse(stream, new BodyContentHandler(textData), metadata, parseContext);

    addTextToDocument(doc, fieldPrefix, textData);

    if (addMetaData) {
        addMetadataToDocument(doc, fieldPrefix, metadata);
    }

    if (addLanguage) {
        addLanguageToDocument(doc, fieldPrefix, textData.toString());
    }
}
/**
 * Common implementation -- take an input stream and return a ConvertedDoc.
 *
 * @param input stream for raw file
 * @param doc raw file
 * @return converted doc
 * @throws IOException if the underlying Tika parser/writer had an IO problem, a parser
 *     problem, or MAX_TEXT_SIZE is reached.
 */
@Override
protected ConvertedDocument conversionImplementation(InputStream input, java.io.File doc)
        throws IOException {
    Metadata metadata = new Metadata();
    BodyContentHandler handler = new BodyContentHandler(maxBuffer);

    try {
        parser.parse(input, handler, metadata, ctx);
    } catch (NoClassDefFoundError classErr) {
        throw new IOException("Unable to parse content due to Tika misconfiguration", classErr);
    } catch (Exception xerr) {
        throw new IOException("Unable to parse content", xerr);
    } finally {
        input.close();
    }

    ConvertedDocument textdoc = new ConvertedDocument(doc);
    textdoc.addTitle(metadata.get(TikaCoreProperties.TITLE));
    textdoc.setEncoding(metadata.get(Metadata.CONTENT_ENCODING));
    textdoc.addCreateDate(metadata.getDate(TikaCoreProperties.CREATED));
    textdoc.addAuthor(metadata.get(TikaCoreProperties.CREATOR));

    // v1.5: until this version this blank line reducer was in place for all documents.
    // Using Java 6 it appeared to cause a StackOverflow when it encountered a document
    // with hundreds of \n in a row. E.g., a spreadsheet doc converted to text may have
    // thousands of empty lines following the last data row.
    // TextUtils.reduce_line_breaks(txt)
    String t = handler.toString();
    if (t != null) {
        if (textdoc.filename != null && FileUtility.isSpreadsheet(textdoc.filename)) {
            textdoc.setText(t.trim());
        } else {
            textdoc.setText(TextUtils.reduce_line_breaks(t));
        }
    }
    return textdoc;
}
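// Hedged note: BodyContentHandler(maxBuffer) caps how many characters are collected;
// when the cap is exceeded Tika aborts the parse with a SAXException, which the catch
// block above converts to an IOException -- presumably the MAX_TEXT_SIZE case the
// javadoc refers to.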
public void testJPEGIPTC() throws Exception {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.CONTENT_TYPE, "image/jpeg");
    InputStream stream = getClass().getResourceAsStream("/test-documents/testJPEG_IPTC_EXT.jpg");
    parser.parse(stream, new DefaultHandler(), metadata, new ParseContext());

    assertEquals("Washington", metadata.get(IPTC.CITY));
    assertEquals("United States", metadata.get(IPTC.COUNTRY));
    assertEquals("US", metadata.get(IPTC.COUNTRY_CODE));
    assertEquals(
            "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
            metadata.get(IPTC.DESCRIPTION));
    assertEquals(
            "A stream bank in Rock Creek Park Washington DC during a photo bike tour with ASPP DC/South chapter.",
            metadata.get(Metadata.DESCRIPTION));
    assertEquals("Rock Creek Park", metadata.get(IPTC.HEADLINE));
    assertEquals("Downstream", metadata.get(Metadata.TITLE));
    assertEquals("intellectual genre", metadata.get(IPTC.INTELLECTUAL_GENRE));

    List<String> iptcKeywords = Arrays.asList(metadata.getValues(IPTC.KEYWORDS));
    assertTrue(iptcKeywords.contains("stream"));
    assertTrue(iptcKeywords.contains("park"));
    assertTrue(iptcKeywords.contains("bank"));
    List<String> tikaKeywords = Arrays.asList(metadata.getValues(Metadata.KEYWORDS));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("stream"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("park"));
    assertTrue(Arrays.toString(tikaKeywords.toArray()).contains("bank"));

    assertEquals("DC", metadata.get(IPTC.PROVINCE_OR_STATE));
    List<String> iptcSceneCode = Arrays.asList(metadata.getValues(IPTC.SCENE_CODE));
    assertEquals(2, iptcSceneCode.size());
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 1"));
    assertTrue(Arrays.toString(iptcSceneCode.toArray()).contains("iptc scene 2"));
    List<String> iptcSubjectCode = Arrays.asList(metadata.getValues(IPTC.SUBJECT_CODE));
    assertEquals(2, iptcSubjectCode.size());
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 1"));
    assertTrue(Arrays.toString(iptcSubjectCode.toArray()).contains("iptc subject code 2"));
    assertEquals("Rock Creek Park", metadata.get(IPTC.SUBLOCATION));

    GregorianCalendar calendar = new GregorianCalendar();
    calendar.set(Calendar.YEAR, 2011);
    calendar.set(Calendar.MONTH, 7);
    calendar.set(Calendar.DATE, 31);
    calendar.set(Calendar.HOUR_OF_DAY, 12);
    calendar.set(Calendar.MINUTE, 0);
    calendar.set(Calendar.SECOND, 0);
    calendar.set(Calendar.MILLISECOND, 0);
    calendar.setTimeZone(TimeZone.getTimeZone("UTC"));
    assertEquals(calendar.getTime(), metadata.getDate(IPTC.DATE_CREATED));

    assertEquals("Ray Gauss II", metadata.get(IPTC.DESCRIPTION_WRITER));
    assertEquals("instructions", metadata.get(IPTC.INSTRUCTIONS));
    assertEquals("job identifier", metadata.get(IPTC.JOB_ID));
    assertEquals("Downstream", metadata.get(IPTC.TITLE));
    assertTrue(metadata.get(IPTC.COPYRIGHT_NOTICE).contains("Ray Gauss II"));
    List<String> creators = Arrays.asList(metadata.getValues(IPTC.CREATOR));
    assertTrue(Arrays.toString(creators.toArray()).contains("Ray Gauss II"));
    assertEquals("DAM Architect", metadata.get(IPTC.CREATORS_JOB_TITLE));
    assertEquals("provider", metadata.get(IPTC.CREDIT_LINE));
    assertEquals("rights usage terms", metadata.get(IPTC.RIGHTS_USAGE_TERMS));
    assertEquals("source", metadata.get(IPTC.SOURCE));
    assertEquals("1234 Some Road", metadata.get(IPTC.CONTACT_INFO_ADDRESS));
    assertEquals("Atlanta", metadata.get(IPTC.CONTACT_INFO_CITY));
    assertEquals("US", metadata.get(IPTC.CONTACT_INFO_COUNTRY));

    List<String> ciWorkEmails = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_EMAIL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("*****@*****.**"));
    assertTrue(Arrays.toString(ciWorkEmails.toArray()).contains("*****@*****.**"));
    List<String> ciWorkTels = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_PHONE));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-1234"));
    assertTrue(Arrays.toString(ciWorkTels.toArray()).contains("555-4321"));
    assertEquals("30339", metadata.get(IPTC.CONTACT_INFO_POSTAL_CODE));
    assertEquals("GA", metadata.get(IPTC.CONTACT_INFO_STATE_PROVINCE));
    List<String> ciWorkUrls = Arrays.asList(metadata.getValues(IPTC.CONTACT_INFO_WEB_URL));
    // Photoshop does not support true multi-value here
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://alfresco.com"));
    assertTrue(Arrays.toString(ciWorkUrls.toArray()).contains("http://example.com"));

    assertEquals("rocky 1 and rocky 2 are big", metadata.get(IPTC.ADDITIONAL_MODEL_INFO));
    List<String> orgCodes = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_CODE));
    assertEquals(2, orgCodes.size());
    assertEquals("ASPP", orgCodes.get(0));
    assertEquals("OTHER_ORG", orgCodes.get(1));
    // List<String> cvTerms = Arrays.asList(metadata.getValues(IPTC.CONTROLLED_VOCABULARY_TERM));
    List<String> modelAges = Arrays.asList(metadata.getValues(IPTC.MODEL_AGE));
    assertEquals(2, modelAges.size());
    assertEquals("1000", modelAges.get(0));
    assertEquals("1001", modelAges.get(1));
    List<String> orgNames = Arrays.asList(metadata.getValues(IPTC.ORGANISATION_NAME));
    assertEquals(2, orgNames.size());
    assertEquals("ASPP", orgNames.get(0));
    assertEquals("Other Org", orgNames.get(1));
    List<String> peopleShown = Arrays.asList(metadata.getValues(IPTC.PERSON));
    assertEquals(2, peopleShown.size());
    assertEquals("rocky 1", peopleShown.get(0));
    assertEquals("rocky 2", peopleShown.get(1));

    assertEquals(
            "http://cv.iptc.org/newscodes/digitalsourcetype/digitalCapture",
            metadata.get(IPTC.DIGITAL_SOURCE_TYPE));
    assertEquals("Photo Bike Tour", metadata.get(IPTC.EVENT));
    assertEquals("RGAUSS", metadata.get(IPTC.IMAGE_SUPPLIER_ID));
    assertEquals("Ray Gauss II", metadata.get(IPTC.IMAGE_SUPPLIER_NAME));
    assertEquals("supplier image ID", metadata.get(IPTC.IMAGE_SUPPLIER_IMAGE_ID));
    assertEquals("3456", metadata.get(IPTC.MAX_AVAIL_HEIGHT));
    assertEquals("5184", metadata.get(IPTC.MAX_AVAIL_WIDTH));
    assertEquals("1.2.0", metadata.get(IPTC.PLUS_VERSION));

    List<String> copyrightOwnerIds = Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_ID));
    assertEquals(1, copyrightOwnerIds.size());
    assertEquals("RGAUSS", copyrightOwnerIds.get(0));
    // assertEquals("", copyrightOwnerIds.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> copyrightOwnerNames =
            Arrays.asList(metadata.getValues(IPTC.COPYRIGHT_OWNER_NAME));
    assertEquals(2, copyrightOwnerNames.size());
    assertEquals("Ray Gauss II", copyrightOwnerNames.get(0));
    assertEquals("GG", copyrightOwnerNames.get(1));
    List<String> imageCreatorIds = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_ID));
    assertEquals(1, imageCreatorIds.size());
    assertEquals("RGAUSS", imageCreatorIds.get(0));
    // assertEquals("", imageCreatorIds.get(1)); // TODO: Get ExifTool to preserve empty values
    assertTrue(metadata.isMultiValued(IPTC.IMAGE_CREATOR_NAME));
    List<String> imageCreatorNames = Arrays.asList(metadata.getValues(IPTC.IMAGE_CREATOR_NAME));
    assertEquals(2, imageCreatorNames.size());
    assertEquals("Ray Gauss II", imageCreatorNames.get(0));
    assertEquals("GG", imageCreatorNames.get(1));
    List<String> licensorIds = Arrays.asList(metadata.getValues(IPTC.LICENSOR_ID));
    assertEquals("RGAUSS", licensorIds.get(0));
    assertTrue(metadata.isMultiValued(IPTC.LICENSOR_NAME));
    List<String> licensorNames = Arrays.asList(metadata.getValues(IPTC.LICENSOR_NAME));
    assertEquals(2, licensorNames.size());
    assertEquals("Ray Gauss II", licensorNames.get(0));
    assertEquals("GG", licensorNames.get(1));
    // Photoshop does not support licensor addresses, cities, or countries
    List<String> licensorEmails = Arrays.asList(metadata.getValues(IPTC.LICENSOR_EMAIL));
    assertEquals("*****@*****.**", licensorEmails.get(0));
    // assertEquals("", licensorEmails.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel1 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_1));
    assertEquals("555-5555", licensorTel1.get(0));
    // assertEquals("", licensorTel1.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorTel2 = Arrays.asList(metadata.getValues(IPTC.LICENSOR_TELEPHONE_2));
    assertEquals("555-4444", licensorTel2.get(0));
    // assertEquals("", licensorTel2.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> licensorUrls = Arrays.asList(metadata.getValues(IPTC.LICENSOR_URL));
    assertEquals("http://rgauss.com", licensorUrls.get(0));
    // assertEquals("", licensorUrls.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("Age Unknown", metadata.get(IPTC.MINOR_MODEL_AGE_DISCLOSURE));
    List<String> modelReleaseIds = Arrays.asList(metadata.getValues(IPTC.MODEL_RELEASE_ID));
    assertEquals("model release id 1", modelReleaseIds.get(0));
    assertEquals("model release id 2", modelReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.MODEL_RELEASE_STATUS));
    List<String> propertyReleaseIds = Arrays.asList(metadata.getValues(IPTC.PROPERTY_RELEASE_ID));
    assertEquals("prop release id 1", propertyReleaseIds.get(0));
    assertEquals("prop release id 2", propertyReleaseIds.get(1));
    assertEquals("Not Applicable", metadata.get(IPTC.PROPERTY_RELEASE_STATUS));

    List<String> aoCopyright =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_COPYRIGHT_NOTICE));
    assertEquals("Ray Gauss II", aoCopyright.get(0));
    // assertEquals("", aoCopyright.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoCopyright.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoCreator =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_CREATOR));
    assertEquals("Mother Nature", aoCreator.get(0));
    assertEquals("Man", aoCreator.get(1));
    assertEquals("Mother Nature", aoCreator.get(2));
    List<String> aoDateCreated =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_DATE_CREATED));
    assertEquals("1890:01:01", aoDateCreated.get(0));
    // assertEquals("", aoDateCreated.get(1)); // TODO: Get ExifTool to preserve empty values
    assertEquals("1901:02:01", aoDateCreated.get(1));
    // assertEquals("", aoDateCreated.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSource =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE));
    assertEquals("National Park Service", aoSource.get(0));
    // assertEquals("", aoSource.get(1)); // TODO: Get ExifTool to preserve empty values
    // assertEquals("", aoSource.get(2)); // TODO: Get ExifTool to preserve empty values
    List<String> aoSourceInventoryNum =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_SOURCE_INVENTORY_NUMBER));
    assertEquals("123456", aoSourceInventoryNum.get(0));
    // assertEquals("", aoSourceInventoryNum.get(1)); // TODO: Get ExifTool to preserve empty values
    // This should be index 2, TODO: Get ExifTool to preserve empty values
    assertEquals("654321", aoSourceInventoryNum.get(1));
    List<String> aoSourceTitles =
            Arrays.asList(metadata.getValues(IPTC.ARTWORK_OR_OBJECT_DETAIL_TITLE));
    assertEquals("Rock Creek Stream Bank", aoSourceTitles.get(0));
    assertEquals("Pollution", aoSourceTitles.get(1));
    assertEquals("Some Tree", aoSourceTitles.get(2));

    List<String> locationShownCity = Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_CITY));
    assertEquals("Washington", locationShownCity.get(0));
    // assertEquals("", locationShownCity.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownCountryCode =
            Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_CODE));
    assertEquals("US", locationShownCountryCode.get(0));
    // assertEquals("", locationShownCountryCode.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownCountryName =
            Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_COUNTRY_NAME));
    assertEquals("United States", locationShownCountryName.get(0));
    // assertEquals("", locationShownCountryName.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownState =
            Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_PROVINCE_OR_STATE));
    assertEquals("D.C.", locationShownState.get(0));
    // assertEquals("", locationShownState.get(1)); // TODO: Get ExifTool to preserve empty values
    List<String> locationShownSublocation =
            Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_SUBLOCATION));
    assertEquals("Rock Creek Park Sub", locationShownSublocation.get(0));
    assertEquals("Stream Section", locationShownSublocation.get(1));
    List<String> locationShownWorldRegion =
            Arrays.asList(metadata.getValues(IPTC.LOCATION_SHOWN_WORLD_REGION));
    assertEquals("North America", locationShownWorldRegion.get(0));
    // assertEquals("", locationShownWorldRegion.get(1)); // TODO: Get ExifTool to preserve empty values

    assertEquals("Washington", metadata.get(IPTC.LOCATION_CREATED_CITY));
    assertEquals("US", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_CODE));
    assertEquals("United States", metadata.get(IPTC.LOCATION_CREATED_COUNTRY_NAME));
    assertEquals("D.C.", metadata.get(IPTC.LOCATION_CREATED_PROVINCE_OR_STATE));
    assertEquals("Rock Creek Park", metadata.get(IPTC.LOCATION_CREATED_SUBLOCATION));
    assertEquals("North America", metadata.get(IPTC.LOCATION_CREATED_WORLD_REGION));

    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    List<String> registryEntryOrgIds =
            Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID));
    assertEquals(2, registryEntryOrgIds.size());
    assertEquals("PLUS", registryEntryOrgIds.get(0));
    // assertEquals("", registryEntryOrgIds.get(1)); // TODO: Get ExifTool to preserve empty values
    // This should be index 2, TODO: Get ExifTool to preserve empty values
    assertEquals("ORG 2", registryEntryOrgIds.get(1));
    assertTrue(IPTC.REGISTRY_ENTRY_CREATED_ORGANISATION_ID.isMultiValuePermitted());
    assertTrue(metadata.isMultiValued(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    List<String> registryEntryItemIds =
            Arrays.asList(metadata.getValues(IPTC.REGISTRY_ENTRY_CREATED_ITEM_ID));
    assertEquals(3, registryEntryItemIds.size());
    assertEquals("100-ABC-ABC-555", registryEntryItemIds.get(0));
    assertEquals("11223344", registryEntryItemIds.get(1));
    assertEquals("55667788", registryEntryItemIds.get(2));
}
private static void setup() throws Exception {
    conf = NutchConfiguration.create();
    conf.setBoolean("parser.html.form.use_action", true);
    utils = new DOMContentUtils(conf);
    TikaParser tikaParser = new TikaParser();
    tikaParser.setConf(conf);
    Parser parser = tikaParser.getTikaConfig().getParser("text/html");
    for (int i = 0; i < testPages.length; i++) {
        Metadata tikamd = new Metadata();
        HTMLDocumentImpl doc = new HTMLDocumentImpl();
        doc.setErrorChecking(false);
        DocumentFragment root = doc.createDocumentFragment();
        DOMBuilder domhandler = new DOMBuilder(doc, root);
        ParseContext context = new ParseContext();
        // to add once available in Tika
        // context.set(HtmlMapper.class, IdentityHtmlMapper.INSTANCE);
        try {
            parser.parse(
                    new ByteArrayInputStream(testPages[i].getBytes()), domhandler, tikamd, context);
            testBaseHrefURLs[i] = new URL(testBaseHrefs[i]);
        } catch (Exception e) {
            e.printStackTrace();
            fail("caught exception: " + e);
        }
        testDOMs[i] = root;
        LSSerializerImpl lsi = new LSSerializerImpl();
        System.out.println("input " + i + ": '" + testPages[i] + "'");
        System.out.println("output " + i + ": '" + lsi.writeToString(root) + "'");
    }
    answerOutlinks =
            new Outlink[][] {
                // 0
                {
                    new Outlink("http://www.nutch.org", "anchor"),
                },
                // 1
                {
                    new Outlink("http://www.nutch.org/", "home"),
                    new Outlink("http://www.nutch.org/docs/bot.html", "bots"),
                },
                // 2
                {
                    new Outlink("http://www.nutch.org/", "separate this"),
                    new Outlink("http://www.nutch.org/docs/ok", "from this"),
                },
                // 3
                {
                    new Outlink("http://www.nutch.org/", "home"),
                    new Outlink("http://www.nutch.org/docs/1", "1"),
                    new Outlink("http://www.nutch.org/docs/2", "2"),
                },
                // 4
                {
                    new Outlink("http://www.nutch.org/frames/top.html", ""),
                    new Outlink("http://www.nutch.org/frames/left.html", ""),
                    new Outlink("http://www.nutch.org/frames/invalid.html", ""),
                    new Outlink("http://www.nutch.org/frames/right.html", ""),
                },
                // 5
                {
                    new Outlink("http://www.nutch.org/maps/logo.gif", ""),
                    new Outlink("http://www.nutch.org/index.html", ""),
                    new Outlink("http://www.nutch.org/maps/#bottom", ""),
                    new Outlink("http://www.nutch.org/bot.html", ""),
                    new Outlink("http://www.nutch.org/docs/index.html", "")
                },
                // 6
                {
                    new Outlink("http://www.nutch.org/index.html", "whitespace test"),
                },
                // 7
                {},
                // 8
                {
                    new Outlink("http://www.nutch.org/dummy.jsp", "test2"),
                },
                // 9
                {},
                // 10
                {
                    new Outlink("http://www.nutch.org/;x", "anchor1"),
                    new Outlink("http://www.nutch.org/g;x", "anchor2"),
                    new Outlink("http://www.nutch.org/g;x?y#s", "anchor3")
                },
                // 11
                {
                    // this is tricky - see RFC3986 section 5.4.1 example 7
                    new Outlink("http://www.nutch.org/g", "anchor1"),
                    new Outlink("http://www.nutch.org/g?y#s", "anchor2"),
                    new Outlink("http://www.nutch.org/;something?y=1", "anchor3"),
                    new Outlink("http://www.nutch.org/;something?y=1#s", "anchor4"),
                    new Outlink("http://www.nutch.org/;something?y=1;somethingelse", "anchor5")
                }
            };
}
public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
        base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
        return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
        String message = "Can't retrieve Tika parser for mime-type " + mimeType;
        LOG.error(message);
        return new ParseStatus(ParseStatus.FAILED, message)
                .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
        parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
        LOG.error("Error parsing " + content.getUrl(), e);
        return new ParseStatus(ParseStatus.FAILED, e.getMessage())
                .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch

    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
        LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
        StringBuffer sb = new StringBuffer();
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting text...");
        }
        utils.getText(sb, root); // extract text
        text = sb.toString();
        sb.setLength(0);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting title...");
        }
        utils.getTitle(sb, root); // extract title
        title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
        ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
        URL baseTag = utils.getBase(root);
        if (LOG.isTraceEnabled()) {
            LOG.trace("Getting links...");
        }
        utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
        outlinks = l.toArray(new Outlink[l.size()]);
        if (LOG.isTraceEnabled()) {
            LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
        }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
        if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) {
            continue;
        }
        // TODO what if multivalued?
        nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor, e.g. works for mime types where no
    // explicit markup for anchors exists
    if (outlinks.length == 0) {
        outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
        status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
        status.setArgs(
                new String[] {
                    metaTags.getRefreshHref().toString(),
                    Integer.toString(metaTags.getRefreshTime())
                });
    }

    ParseData parseData =
            new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
            ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
        for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse) {
            entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
        }
    }
    return filteredParse;
}
@Override
public void load(
        SolrQueryRequest req,
        SolrQueryResponse rsp,
        ContentStream stream,
        UpdateRequestProcessor processor)
        throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
        // Cache? Parsers are lightweight to construct and thread-safe, so I'm told
        MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
        parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
        parser = autoDetectParser;
    }

    if (parser != null) {
        Metadata metadata = new Metadata();

        // If you specify the resource name (the filename, roughly) with this parameter,
        // then Tika can make use of it in guessing the appropriate MIME type:
        String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
        if (resourceName != null) {
            metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
        }

        // Provide stream's content type as hint for auto detection
        if (stream.getContentType() != null) {
            metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
        }

        InputStream inputStream = null;
        try {
            inputStream = stream.getStream();
            metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
            metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
            metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
            metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());

            // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
            String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
            if (charset != null) {
                metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
            }

            String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
            boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
            SolrContentHandler handler =
                    factory.createSolrContentHandler(metadata, params, req.getSchema());
            ContentHandler parsingHandler = handler;

            StringWriter writer = null;
            BaseMarkupSerializer serializer = null;
            if (extractOnly == true) {
                String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
                writer = new StringWriter();
                if (extractFormat.equals(TEXT_FORMAT)) {
                    serializer = new TextSerializer();
                    serializer.setOutputCharStream(writer);
                    serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
                } else {
                    serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
                }
                if (xpathExpr != null) {
                    Matcher matcher = PARSER.parse(xpathExpr);
                    // The MatchingContentHandler does not invoke startDocument. See
                    // http://tika.markmail.org/message/kknu3hw7argwiqin
                    serializer.startDocument();
                    parsingHandler = new MatchingContentHandler(serializer, matcher);
                } else {
                    parsingHandler = serializer;
                }
            } else if (xpathExpr != null) {
                Matcher matcher = PARSER.parse(xpathExpr);
                parsingHandler = new MatchingContentHandler(handler, matcher);
            } // else leave it as is

            try {
                // potentially use a wrapper handler for parsing, but we still need the
                // SolrContentHandler for getting the document.
                ParseContext context = parseContextConfig.create();
                context.set(Parser.class, parser);
                context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

                // Password handling
                RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
                String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
                if (pwMapFile != null && pwMapFile.length() > 0) {
                    InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
                    if (is != null) {
                        log.debug("Password file supplied: " + pwMapFile);
                        epp.parse(is);
                    }
                }
                context.set(PasswordProvider.class, epp);
                String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
                if (resourcePassword != null) {
                    epp.setExplicitPassword(resourcePassword);
                    log.debug("Literal password supplied for file " + resourceName);
                }

                parser.parse(inputStream, parsingHandler, metadata, context);
            } catch (TikaException e) {
                if (ignoreTikaException) {
                    log.warn(
                            new StringBuilder("skip extracting text due to ")
                                    .append(e.getLocalizedMessage())
                                    .append(". metadata=")
                                    .append(metadata.toString())
                                    .toString());
                } else {
                    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
                }
            }
            if (extractOnly == false) {
                addDoc(handler);
            } else {
                // serializer is not null, so we need to call endDoc on it if using xpath
                if (xpathExpr != null) {
                    serializer.endDocument();
                }
                rsp.add(stream.getName(), writer.toString());
                writer.close();
                String[] names = metadata.names();
                NamedList metadataNL = new NamedList();
                for (int i = 0; i < names.length; i++) {
                    String[] vals = metadata.getValues(names[i]);
                    metadataNL.add(names[i], vals);
                }
                rsp.add(stream.getName() + "_metadata", metadataNL);
            }
        } catch (SAXException e) {
            throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
    } else {
        throw new SolrException(
                SolrException.ErrorCode.BAD_REQUEST,
                "Stream type of "
                        + streamType
                        + " didn't match any known parsers. Please supply the "
                        + ExtractingParams.STREAM_TYPE
                        + " parameter.");
    }
}
@Test
public void testOO2() throws Exception {
    for (Parser parser : getParsers()) {
        try (InputStream input =
                ODFParserTest.class.getResourceAsStream("/test-documents/testOpenOffice2.odt")) {
            Metadata metadata = new Metadata();
            ContentHandler handler = new BodyContentHandler();
            parser.parse(input, handler, metadata, new ParseContext());

            assertEquals(
                    "application/vnd.oasis.opendocument.text", metadata.get(Metadata.CONTENT_TYPE));
            assertEquals("en-US", metadata.get(Metadata.LANGUAGE));
            assertEquals("PT1M7S", metadata.get(Metadata.EDIT_TIME));
            assertEquals(
                    "NeoOffice/2.2$Unix OpenOffice.org_project/680m18$Build-9161",
                    metadata.get("generator"));

            // Check date metadata, both old-style and new-style
            assertEquals("2007-09-14T11:07:10", metadata.get(TikaCoreProperties.MODIFIED));
            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.MODIFIED));
            assertEquals("2007-09-14T11:07:10", metadata.get(Metadata.DATE));
            assertEquals("2007-09-14T11:06:08", metadata.get(TikaCoreProperties.CREATED));
            assertEquals("2007-09-14T11:06:08", metadata.get(Metadata.CREATION_DATE));

            // Check the document statistics
            assertEquals("1", metadata.get(Office.PAGE_COUNT));
            assertEquals("1", metadata.get(Office.PARAGRAPH_COUNT));
            assertEquals("14", metadata.get(Office.WORD_COUNT));
            assertEquals("78", metadata.get(Office.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Office.TABLE_COUNT));
            assertEquals("0", metadata.get(Office.OBJECT_COUNT));
            assertEquals("0", metadata.get(Office.IMAGE_COUNT));

            // Check the Tika-1.0 style document statistics
            assertEquals("1", metadata.get(Metadata.PAGE_COUNT));
            assertEquals("1", metadata.get(Metadata.PARAGRAPH_COUNT));
            assertEquals("14", metadata.get(Metadata.WORD_COUNT));
            assertEquals("78", metadata.get(Metadata.CHARACTER_COUNT));
            assertEquals("0", metadata.get(Metadata.TABLE_COUNT));
            assertEquals("0", metadata.get(Metadata.OBJECT_COUNT));
            assertEquals("0", metadata.get(Metadata.IMAGE_COUNT));

            // Check the very old style statistics (these will be removed shortly)
            assertEquals("0", metadata.get("nbTab"));
            assertEquals("0", metadata.get("nbObject"));
            assertEquals("0", metadata.get("nbImg"));
            assertEquals("1", metadata.get("nbPage"));
            assertEquals("1", metadata.get("nbPara"));
            assertEquals("14", metadata.get("nbWord"));
            assertEquals("78", metadata.get("nbCharacter"));

            // Custom metadata tags present but without values
            assertEquals(null, metadata.get("custom:Info 1"));
            assertEquals(null, metadata.get("custom:Info 2"));
            assertEquals(null, metadata.get("custom:Info 3"));
            assertEquals(null, metadata.get("custom:Info 4"));

            String content = handler.toString();
            assertTrue(
                    content.contains(
                            "This is a sample Open Office document,"
                                    + " written in NeoOffice 2.2.1 for the Mac."));
        }
    }
}
@Override
public void operate(FlowProcess flowProcess, FunctionCall<NullContext> functionCall) {
    String email = functionCall.getArguments().getString("email");
    _numEmails += 1;

    Metadata metadata = new Metadata();
    try {
        InputStream stream = new ByteArrayInputStream(email.getBytes("UTF-8"));
        _parser.parse(stream, _handler, metadata, new ParseContext());

        // _content now has all of the body text, and metadata has the header info.
        String messageId = getMetadata(metadata, TikaCoreProperties.IDENTIFIER);
        String author = "";
        String address = "";
        String creator = getMetadata(metadata, TikaCoreProperties.CREATOR);
        Matcher addressMatcher = FULL_EMAIL_ADDRESS_PATTERN.matcher(creator);
        if (addressMatcher.matches()) {
            author = addressMatcher.group(1);
            address = addressMatcher.group(2);
        } else {
            addressMatcher = SIMPLE_EMAIL_ADDRESS_PATTERN.matcher(creator);
            if (addressMatcher.matches()) {
                address = addressMatcher.group(1);
            }
        }

        String subject = getMetadata(metadata, TikaCoreProperties.TITLE);
        String replyId = getMetadata(metadata, TikaCoreProperties.RELATION);
        String creationDate = getMetadata(metadata, TikaCoreProperties.CREATED);
        String content = _content.toString();
        _emailChars += content.length();

        // If size is greater than say 4x average, skip it. Otherwise we can get
        // some huge emails when a person includes all of the source code for their
        // project.
        if ((_numEmails > 100) && (content.length() > (4 * _emailChars / _numEmails))) {
            _numSkipped += 1;
            return;
        }

        // Need to convert all CRLF & raw linefeeds into \n sequences, so our file format
        // is correct. We do the same for tabs, so that it's easy to parse the result.
        content = content.replaceAll("\r\n", "\\\\n");
        content = content.replaceAll("[\r\n]", "\\\\n");
        content = content.replaceAll("\t", "\\\\t");

        Tuple tuple = new Tuple(messageId, author, address, subject, creationDate, replyId, content);
        functionCall.getOutputCollector().add(tuple);
    } catch (Exception e) {
        LOGGER.error("Exception parsing email: " + e.getMessage());
    } catch (NoClassDefFoundError e) {
        // This will happen when we have an embedded object (multi-part email) which
        // needs parsing support we don't include.
        LOGGER.error("Exception parsing email due to missing class: " + e.getMessage());
    }
}