private void scan(ByteArrayInputStream in, String path, SVNDirEntry dirEntry) {
    // Runs the given content through Tika's auto-detecting parser and stores the extracted
    // body text in the CONTENTS field of the current document. Any failure is logged together
    // with the path and revision and is not propagated; the stream is closed in every case.
    try {
      // Hand the path to Tika as the resource name.
      Metadata tikaMetadata = new Metadata();
      tikaMetadata.set(Metadata.RESOURCE_NAME_KEY, path);

      // Parser set-up as proposed by the Tika authors in
      // https://issues.apache.org/jira/browse/TIKA-232
      // (default configuration, no delegate parser).
      DefaultHandler bodyHandler = new BodyContentHandler();
      Parser autoDetect = new AutoDetectParser(TikaConfig.getDefaultConfig());
      autoDetect.parse(in, bodyHandler, tikaMetadata);

      String extractedText = bodyHandler.toString();
      getDocument().addTokenizedField(FieldNames.CONTENTS, extractedText);
    } catch (Exception e) {
      LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e);
    } finally {
      try {
        in.close();
      } catch (Exception e) {
        LOGGER.error("We had an exception " + path + " (r" + dirEntry.getRevision() + ")", e);
      }
    }
  }
  /**
   * Extends the parent's parse context with an {@link AutoDetectParser} when embedded content
   * is to be included.
   *
   * <p>Whether to include it is taken from {@code options.getIncludeEmbedded()} when that is
   * non-null, otherwise from the {@code includeContents} field. The shared {@code tikaConfig}
   * is initialised with the default configuration on first use.
   *
   * @param metadata passed through to the parent implementation
   * @param targetMimeType passed through to the parent implementation
   * @param options transformation options; its include-embedded setting, if set, overrides
   *     {@code includeContents}
   * @return the context built by the parent, with a {@link Parser} registered if embedded
   *     content is to be included
   */
  @Override
  protected ParseContext buildParseContext(
      Metadata metadata, String targetMimeType, TransformationOptions options) {
    final ParseContext parseContext = super.buildParseContext(metadata, targetMimeType, options);

    // An explicit per-call setting wins over the configured default.
    final Boolean requested = options.getIncludeEmbedded();
    final boolean parseEmbedded;
    if (requested == null) {
      parseEmbedded = includeContents;
    } else {
      parseEmbedded = requested;
    }

    if (!parseEmbedded) {
      return parseContext;
    }

    // Use an auto detect parser to handle the contents
    if (tikaConfig == null) {
      tikaConfig = TikaConfig.getDefaultConfig();
    }
    parseContext.set(Parser.class, new AutoDetectParser(tikaConfig));
    return parseContext;
  }
  // Example #3 (0)
  /**
   * {@link EmbeddedDocumentExtractor} that writes every embedded document it is handed to a
   * file below {@code extractDir}.
   *
   * <p>The file name is taken from the {@link Metadata#RESOURCE_NAME_KEY} entry, falling back
   * to a numbered {@code fileN} name. Because that name originates from the document being
   * processed, it is checked to resolve inside {@code extractDir} before anything is written.
   */
  private class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor {

    /** Running number used to name embedded documents that carry no resource name. */
    private int count = 0;

    /** Configuration whose MIME repository is used to look up file extensions. */
    private final TikaConfig config = TikaConfig.getDefaultConfig();

    /** Accepts every embedded document. */
    public boolean shouldParseEmbedded(Metadata metadata) {
      return true;
    }

    /**
     * Saves one embedded document to disk.
     *
     * @param inputStream content of the embedded document
     * @param contentHandler not used by this implementation
     * @param metadata metadata of the embedded document; supplies the resource name and the
     *     embedded relationship id
     * @param outputHtml not used by this implementation
     * @throws SAXException declared by the interface; not thrown by the visible code
     * @throws IOException if the target lies outside {@code extractDir}, if the target
     *     directory cannot be created, or if closing the output file fails
     */
    public void parseEmbedded(
        InputStream inputStream,
        ContentHandler contentHandler,
        Metadata metadata,
        boolean outputHtml)
        throws SAXException, IOException {
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

      if (name == null) {
        name = "file" + count++;
      }

      MediaType contentType = detector.detect(inputStream, metadata);

      // Names without any dot get the extension registered for the detected type.
      if (name.indexOf('.') == -1 && contentType != null) {
        try {
          name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
          // Best effort only: the file is still saved, just without an extension.
          logger.warn("Unable to determine a file extension for " + contentType, e);
        }
      }

      String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
      if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;
      }

      File outputFile = resolveOutputFile(name);
      File parent = outputFile.getParentFile();
      if (parent != null && !parent.exists()) {
        if (!parent.mkdirs()) {
          throw new IOException("unable to create directory \"" + parent + "\"");
        }
      }
      System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile);

      FileOutputStream os = null;

      try {
        os = new FileOutputStream(outputFile);

        // instanceof is false for null, so no separate null check on the container is needed.
        Object container = null;
        if (inputStream instanceof TikaInputStream) {
          container = ((TikaInputStream) inputStream).getOpenContainer();
        }

        if (container instanceof DirectoryEntry) {
          // The stream carries an open POIFS directory: copy its entries into a fresh
          // file system and write that out instead of the raw stream.
          POIFSFileSystem fs = new POIFSFileSystem();
          copy((DirectoryEntry) container, fs.getRoot());
          fs.writeFilesystem(os);
        } else {
          IOUtils.copy(inputStream, os);
        }
      } catch (Exception e) {
        logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
      } finally {
        if (os != null) {
          os.close();
        }
      }
    }

    /**
     * Resolves {@code name} against {@code extractDir} and verifies that the result stays
     * inside that directory.
     *
     * @param name file name, possibly containing directory components
     * @return the file to write to, resolved exactly as {@code new File(extractDir, name)}
     * @throws IOException if the canonical target is not located below the canonical
     *     extraction directory (for example because of {@code ..} components), or if a
     *     canonical path cannot be computed
     */
    private File resolveOutputFile(String name) throws IOException {
      File outputFile = new File(extractDir, name);

      // Resolve the base through the same File(parent, child) constructor so that it is
      // interpreted exactly like the parent of outputFile above.
      File baseDir = new File(extractDir, ".").getCanonicalFile();
      String basePrefix = baseDir.getPath();
      if (!basePrefix.endsWith(File.separator)) {
        basePrefix += File.separator;
      }

      if (!outputFile.getCanonicalPath().startsWith(basePrefix)) {
        throw new IOException(
            "refusing to extract \"" + name + "\" outside of \"" + baseDir + "\"");
      }
      return outputFile;
    }

    /**
     * Recursively copies all entries of {@code sourceDir} into {@code destDir}.
     *
     * @param sourceDir directory whose entries are read
     * @param destDir directory in which matching directories and documents are created
     * @throws IOException if creating a directory or document, or reading an entry, fails
     */
    protected void copy(DirectoryEntry sourceDir, DirectoryEntry destDir) throws IOException {
      for (org.apache.poi.poifs.filesystem.Entry entry : sourceDir) {
        if (entry instanceof DirectoryEntry) {
          // Need to recurse
          DirectoryEntry newDir = destDir.createDirectory(entry.getName());
          copy((DirectoryEntry) entry, newDir);
        } else {
          // Copy entry
          InputStream contents = new DocumentInputStream((DocumentEntry) entry);
          try {
            destDir.createDocument(entry.getName(), contents);
          } finally {
            contents.close();
          }
        }
      }
    }
  }