/** * We don't currently support the .xlsb file format (an OOXML container with binary blobs), but we * shouldn't break on these files either (TIKA-826) */ @Test public void testExcelXLSB() throws Exception { Detector detector = new DefaultDetector(); AutoDetectParser parser = new AutoDetectParser(); Metadata m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb"); // Should be detected correctly MediaType type; try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString()); } // OfficeParser won't handle it assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); // OOXMLParser won't handle it assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); // AutoDetectParser doesn't break on it try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) { ContentHandler handler = new BodyContentHandler(-1); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); parser.parse(input, handler, m, context); String content = handler.toString(); assertEquals("", content); } }
private void parseImage(Image image, File file) throws Exception { try { // Detects the file type BodyContentHandler handler = new BodyContentHandler(); Metadata metadata = new Metadata(); FileInputStream inputStream = new FileInputStream(file); ParseContext parseContext = new ParseContext(); // Parser AutoDetectParser parser = new AutoDetectParser(); parser.parse(inputStream, handler, metadata, parseContext); // Image field setting String date; if (metadata.getDate(metadata.ORIGINAL_DATE) != null) { date = metadata.getDate(metadata.ORIGINAL_DATE).toString(); } else if (metadata.getDate(TikaCoreProperties.CREATED) != null) { date = metadata.getDate(TikaCoreProperties.CREATED).toString(); } else if (metadata.getDate(DublinCore.CREATED) != null) { date = metadata.getDate(DublinCore.CREATED).toString(); } else if (metadata.getDate(TikaCoreProperties.METADATA_DATE) != null) { date = metadata.getDate(TikaCoreProperties.METADATA_DATE).toString(); } else if (metadata.getDate(DublinCore.MODIFIED) != null) { date = metadata.getDate(DublinCore.MODIFIED).toString(); } else { // Current date+time metadata.set(Metadata.DATE, new Date()); date = metadata.get(Metadata.DATE); } image.setLongitude(metadata.get(Geographic.LONGITUDE)); image.setLatitude(metadata.get(Geographic.LATITUDE)); ImageOperations.setMetadataParsingFinished(); if (date != null) { image.setDate(date.toString()); } else { image.setDate(null); } image.setLongitude(image.getLongitude()); image.setLatitude(image.getLatitude()); aPII.reverseGeocode(image); ImageOperations.setReverseGeocodeFinished(); ImageOperations iO = new ImageOperations(); iO.doOCR(image, file); ImageOperations.setOcrFinished(); } catch (IOException e) { System.out.println(e.getMessage()); } catch (TikaException te) { System.out.println(te.getMessage()); } catch (SAXException se) { System.out.println(se.getMessage()); } catch (InterruptedException ie) { System.out.println(ie.getMessage()); } catch (IM4JavaException je) { je.printStackTrace(); } }
/**
 * Extracts metadata from a stream whose media type is already known.
 *
 * <p>The parser is given a detector that always answers with {@code type}, so the supplied
 * type is used instead of being detected from the stream.
 *
 * @param type media type of the data
 * @param is input stream holding the data
 * @return the metadata filled in while parsing the stream
 * @throws RuntimeException wrapping any exception raised during parsing
 */
public static Metadata detect(final MediaType type, InputStream is) {
  final Metadata collected = new Metadata();
  final AutoDetectParser fixedTypeParser = new AutoDetectParser((stream, hints) -> type);
  try {
    // Text content is discarded; only the metadata side effect matters here.
    fixedTypeParser.parse(is, new DefaultHandler(), collected);
    return collected;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
private JSONObject extractTika(String contents) { JSONObject jObj = (JSONObject) JSONSerializer.toJSON(contents); if (jObj.containsKey("_source")) { JSONObject jObjSource = jObj.getJSONObject("_source"); if (jObjSource.containsKey("raw_content")) { String rawHtml = jObjSource.getString("raw_content"); ByteArrayInputStream bIs = new ByteArrayInputStream(rawHtml.getBytes()); Metadata metadata = new Metadata(); AutoDetectParser adp = new AutoDetectParser(); ContentHandler handler = new BodyContentHandler(10 * 1024 * 1024); try { adp.parse(bIs, handler, metadata); String[] metadataNames = metadata.names(); JSONObject jObjMetadata = new JSONObject(); for (String metadataName : metadataNames) { String[] values = metadata.getValues(metadataName); JSONArray jArray = new JSONArray(); for (String mValue : values) { jArray.add(mValue); } jObjMetadata.accumulate(metadataName, jArray); } // remove empty lines from the text String rawTextAdjusted = handler.toString().replaceAll("(?m)^[ \t]*\r?\n", ""); // detect language LanguageIdentifier li = new LanguageIdentifier(rawTextAdjusted); jObjSource.accumulate("tikametadata", jObjMetadata); jObjSource.accumulate("raw_text", rawTextAdjusted); jObjSource.accumulate("rawtextdetectedlanguage", li.getLanguage()); } catch (Exception e) { LOG.error("Error:", e); ; } } } return jObj; }
/** do NOT make public */ final class TikaImpl { /** subset of parsers for types we support */ private static final Parser PARSERS[] = new Parser[] { // documents new org.apache.tika.parser.html.HtmlParser(), new org.apache.tika.parser.rtf.RTFParser(), new org.apache.tika.parser.pdf.PDFParser(), new org.apache.tika.parser.txt.TXTParser(), new org.apache.tika.parser.microsoft.OfficeParser(), new org.apache.tika.parser.microsoft.OldExcelParser(), new org.apache.tika.parser.microsoft.ooxml.OOXMLParser(), new org.apache.tika.parser.odf.OpenDocumentParser(), new org.apache.tika.parser.iwork.IWorkPackageParser(), new org.apache.tika.parser.xml.DcXMLParser(), }; /** autodetector based on this subset */ private static final AutoDetectParser PARSER_INSTANCE = new AutoDetectParser(PARSERS); /** singleton tika instance */ private static final Tika TIKA_INSTANCE = new Tika(PARSER_INSTANCE.getDetector(), PARSER_INSTANCE); /** parses with tika, throwing any exception hit while parsing the document */ // only package private for testing! static String parse(final byte content[], final Metadata metadata, final int limit) throws TikaException, IOException { // check that its not unprivileged code like a script SecurityManager sm = System.getSecurityManager(); if (sm != null) { sm.checkPermission(new SpecialPermission()); } try { return AccessController.doPrivileged( new PrivilegedExceptionAction<String>() { @Override public String run() throws TikaException, IOException { return TIKA_INSTANCE.parseToString(StreamInput.wrap(content), metadata, limit); } }); } catch (PrivilegedActionException e) { // checked exception from tika: unbox it Throwable cause = e.getCause(); if (cause instanceof TikaException) { throw (TikaException) cause; } else if (cause instanceof IOException) { throw (IOException) cause; } else { throw new AssertionError(cause); } } } }
/* * (non-Javadoc) * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext) */ @Override protected void render(RenderingContext context) { ContentReader contentReader = context.makeContentReader(); String sourceMimeType = contentReader.getMimetype(); // Check that Tika supports the supplied file AutoDetectParser p = new AutoDetectParser(tikaConfig); MediaType sourceMediaType = MediaType.parse(sourceMimeType); if (!p.getParsers().containsKey(sourceMediaType)) { throw new RenditionServiceException( "Source mime type of " + sourceMimeType + " is not supported by Tika for HTML conversions"); } // Make the HTML Version using Tika // This will also extract out any images as found generateHTML(p, context); }
/**
 * Writes every media type known to the default {@link AutoDetectParser} to standard output,
 * each followed by its aliases, its supertype (if any) and the class name of the parser
 * registered for it (if any).
 */
private void displaySupportedTypes() {
  AutoDetectParser autoDetect = new AutoDetectParser();
  MediaTypeRegistry typeRegistry = autoDetect.getMediaTypeRegistry();
  Map<MediaType, Parser> parserByType = autoDetect.getParsers();

  for (MediaType mediaType : typeRegistry.getTypes()) {
    System.out.println(mediaType);

    for (MediaType aliasType : typeRegistry.getAliases(mediaType)) {
      System.out.println(" alias: " + aliasType);
    }

    MediaType parentType = typeRegistry.getSupertype(mediaType);
    if (parentType != null) {
      System.out.println(" supertype: " + parentType);
    }

    Parser registered = parserByType.get(mediaType);
    if (registered != null) {
      System.out.println(" parser: " + registered.getClass().getName());
    }
  }
}
/** Excel 5 and 95 are older formats, and only get basic support */ @Test public void testExcel95() throws Exception { Detector detector = new DefaultDetector(); AutoDetectParser parser = new AutoDetectParser(); MediaType type; Metadata m; // First try detection of Excel 5 m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls"); try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel", type.toString()); } // Now Excel 95 m = new Metadata(); m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls"); try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { type = detector.detect(input, m); assertEquals("application/vnd.ms-excel", type.toString()); } // OfficeParser can handle it assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type)); // OOXMLParser won't handle it assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type)); // Parse the Excel 5 file m = new Metadata(); try (InputStream input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls")) { ContentHandler handler = new BodyContentHandler(-1); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); parser.parse(input, handler, m, context); String content = handler.toString(); // Sheet names assertContains("Feuil1", content); assertContains("Feuil3", content); // Text assertContains("Sample Excel", content); assertContains("Number", content); // Numbers assertContains("15", content); assertContains("225", content); // Metadata was also fetched assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE)); assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR)); } // Parse the Excel 95 file m = new Metadata(); try (InputStream input = 
ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls")) { ContentHandler handler = new BodyContentHandler(-1); ParseContext context = new ParseContext(); context.set(Locale.class, Locale.US); parser.parse(input, handler, m, context); String content = handler.toString(); // Sheet name assertContains("Foglio1", content); // Very boring file, no actual text or numbers! // Metadata was also fetched assertEquals(null, m.get(TikaCoreProperties.TITLE)); assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR)); } }
/**
 * Builds a basic metacard from the given stream by parsing it with Tika's
 * {@link AutoDetectParser} and copying the title, created date, modified date and resource URI
 * onto the metacard.
 *
 * <p>An empty or missing title is replaced by {@code "<No title provided>"}. Created and
 * modified dates are only set when the corresponding metadata value is present; documents
 * without them no longer fail the transformation.
 *
 * @param input stream to parse; must not be {@code null}
 * @param uri resource URI to record on the metacard, or {@code null} to record none
 * @return the populated metacard
 * @throws IOException declared by the signature; parser I/O failures propagate unchanged
 * @throws CatalogTransformerException if {@code input} is {@code null}, or wrapping a
 *     {@code SAXException} or {@code TikaException} raised while parsing
 * @throws IllegalArgumentException if a date value is present but not a valid xsd:dateTime,
 *     or if {@code uri} is not a valid URI
 */
@Override
public Metacard transform(InputStream input, String uri)
    throws IOException, CatalogTransformerException {
  if (input == null) {
    throw new CatalogTransformerException("Cannot transform null input.");
  }

  MetacardImpl metacard = new MetacardImpl(BasicTypes.BASIC_METACARD);
  Metadata metadata = new Metadata();
  ContentHandler handler = new BodyContentHandler();
  AutoDetectParser parser = new AutoDetectParser();

  try {
    parser.parse(input, handler, metadata);

    String title = metadata.get(TikaCoreProperties.TITLE);
    if (LOGGER.isDebugEnabled()) {
      LOGGER.debug("Title: " + title);
      LOGGER.debug("Creator: " + metadata.get(TikaCoreProperties.CREATOR));
      LOGGER.debug("Author: " + metadata.get(Metadata.AUTHOR));
      LOGGER.debug("Creation Date: " + metadata.get(TikaCoreProperties.CREATED));
      LOGGER.debug("Modified Date: " + metadata.get(TikaCoreProperties.MODIFIED));
      LOGGER.debug("Content Type: " + metadata.get(Metadata.CONTENT_TYPE));
    }

    if (StringUtils.isEmpty(title)) {
      title = "<No title provided>";
    }
    metacard.setTitle(title);

    // Many documents carry no created/modified date; only set what is actually there.
    Date created = parseMetadataDate(metadata.get(TikaCoreProperties.CREATED));
    if (created != null) {
      metacard.setCreatedDate(created);
    }
    Date modified = parseMetadataDate(metadata.get(TikaCoreProperties.MODIFIED));
    if (modified != null) {
      metacard.setModifiedDate(modified);
    }

    if (uri != null) {
      metacard.setResourceURI(URI.create(uri));
    } else {
      metacard.setResourceURI(null);
    }
  } catch (SAXException e) {
    LOGGER.warn(e);
    throw new CatalogTransformerException(e);
  } catch (TikaException e) {
    LOGGER.warn(e);
    throw new CatalogTransformerException(e);
  }

  return metacard;
}

/** Converts an xsd:dateTime metadata value to a {@code Date}, or null when it is absent. */
private static Date parseMetadataDate(String value) {
  if (StringUtils.isEmpty(value)) {
    return null;
  }
  return javax.xml.bind.DatatypeConverter.parseDateTime(value).getTime();
}