private void scan(ByteArrayInputStream in, String path, SVNDirEntry dirEntry) { try { Metadata metadata = new Metadata(); metadata.set(Metadata.RESOURCE_NAME_KEY, path); // The following code part is from an proposal of the Authors of // Tika: // https://issues.apache.org/jira/browse/TIKA-232 TikaConfig config = TikaConfig.getDefaultConfig(); // without a // delegate // parser Parser parser = new AutoDetectParser(config); DefaultHandler handler = new BodyContentHandler(); parser.parse(in, handler, metadata); getDocument().addTokenizedField(FieldNames.CONTENTS, handler.toString()); } catch (Exception e) { LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e); } finally { try { in.close(); } catch (Exception e) { LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e); } } }
@Override protected ParseContext buildParseContext( Metadata metadata, String targetMimeType, TransformationOptions options) { ParseContext context = super.buildParseContext(metadata, targetMimeType, options); boolean recurse = includeContents; if (options.getIncludeEmbedded() != null) { recurse = options.getIncludeEmbedded(); } if (recurse) { // Use an auto detect parser to handle the contents if (tikaConfig == null) { tikaConfig = TikaConfig.getDefaultConfig(); } context.set(Parser.class, new AutoDetectParser(tikaConfig)); } return context; }
private class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { private int count = 0; private final TikaConfig config = TikaConfig.getDefaultConfig(); public boolean shouldParseEmbedded(Metadata metadata) { return true; } public void parseEmbedded( InputStream inputStream, ContentHandler contentHandler, Metadata metadata, boolean outputHtml) throws SAXException, IOException { String name = metadata.get(Metadata.RESOURCE_NAME_KEY); if (name == null) { name = "file" + count++; } MediaType contentType = detector.detect(inputStream, metadata); if (name.indexOf('.') == -1 && contentType != null) { try { name += config.getMimeRepository().forName(contentType.toString()).getExtension(); } catch (MimeTypeException e) { e.printStackTrace(); } } String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID); if (relID != null && !name.startsWith(relID)) { name = relID + "_" + name; } File outputFile = new File(extractDir, name); File parent = outputFile.getParentFile(); if (!parent.exists()) { if (!parent.mkdirs()) { throw new IOException("unable to create directory \"" + parent + "\""); } } System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile); FileOutputStream os = null; try { os = new FileOutputStream(outputFile); if (inputStream instanceof TikaInputStream) { TikaInputStream tin = (TikaInputStream) inputStream; if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) { POIFSFileSystem fs = new POIFSFileSystem(); copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot()); fs.writeFilesystem(os); } else { IOUtils.copy(inputStream, os); } } else { IOUtils.copy(inputStream, os); } } catch (Exception e) { logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e); } finally { if (os != null) { os.close(); } } } protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) throws IOException { for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) { if (entry instanceof DirectoryEntry) { // Need to recurse DirectoryEntry newDir = destDir.createDirectory(entry.getName()); copy((DirectoryEntry) entry, newDir); } else { // Copy entry InputStream contents = new DocumentInputStream((DocumentEntry) entry); try { destDir.createDocument(entry.getName(), contents); } finally { contents.close(); } } } } }