Example #1
  /**
   * Get the MIME type of an Asset based on its type. If the Asset already has the "content-type"
   * property set, we return that. Otherwise the Apache Tika library is used for file type
   * detection.
   *
   * @param entity the Asset entity whose file metadata is consulted
   * @param type the asset content, as a byte[] or a File
   * @return A string representation of the content type suitable for use in an HTTP header, e.g.
   *     "image/jpeg" for a JPEG image.
   */
  public <T> String getMimeType(Entity entity, T type) {

    Map<String, Object> fileMetadata = AssetUtils.getFileMetadata(entity);
    if (fileMetadata.get(AssetUtils.CONTENT_TYPE) != null) {
      return (String) fileMetadata.get(AssetUtils.CONTENT_TYPE);
    }

    Metadata metadata = new Metadata();
    MediaType mediaType = MediaType.OCTET_STREAM;
    try {
      if (type instanceof byte[]) {

        ByteArrayInputStream bais = new ByteArrayInputStream((byte[]) type);
        mediaType = detector.detect(bais, metadata);
      } else if (type instanceof File) {

        InputStream fis = new BufferedInputStream(new FileInputStream((File) type));
        try {
          mediaType = detector.detect(fis, metadata);
        } finally {
          fis.close();
        }
      } else {
        return mediaType.toString();
      }

      fileMetadata.put(AssetUtils.CONTENT_TYPE, mediaType.toString());
    } catch (IOException e) {
      LOG.error("error detecting mime type", e);
    }

    return mediaType.toString();
  }
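
For context, the detector field referenced above is a Tika Detector. A minimal, self-contained sketch of the same detection call, assuming Tika's DefaultDetector (which aggregates the magic-byte, container, and file-name glob detectors found on the classpath):

import java.io.ByteArrayInputStream;
import java.io.IOException;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class DetectorSketch {
  public static void main(String[] args) throws IOException {
    Detector detector = new DefaultDetector();
    // The PNG magic bytes are enough for a confident match.
    byte[] png = {(byte) 0x89, 'P', 'N', 'G', 0x0D, 0x0A, 0x1A, 0x0A};
    MediaType type = detector.detect(new ByteArrayInputStream(png), new Metadata());
    System.out.println(type); // image/png
  }
}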
  public String getContentType(String fileName) {
    if (Validator.isNull(fileName)) {
      return ContentTypes.APPLICATION_OCTET_STREAM;
    }

    try {
      Metadata metadata = new Metadata();

      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      MediaType mediaType = _detector.detect(null, metadata);

      String contentType = mediaType.toString();

      if (!contentType.contains("tika")) {
        return contentType;
      } else if (_log.isDebugEnabled()) {
        _log.debug("Retrieved invalid content type " + contentType);
      }
    } catch (Exception e) {
      _log.error(e, e);
    }

    return ContentTypes.APPLICATION_OCTET_STREAM;
  }
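
The _detector.detect(null, metadata) call above relies on the Detector contract: the stream argument may be null, in which case detection falls back to the resource-name glob rules alone. A minimal sketch, assuming DefaultDetector:

import java.io.IOException;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;

public class NameOnlyDetection {
  public static void main(String[] args) throws IOException {
    Metadata metadata = new Metadata();
    metadata.set(Metadata.RESOURCE_NAME_KEY, "report.pdf");
    // No stream: only the file name is available for detection.
    MediaType type = new DefaultDetector().detect(null, metadata);
    System.out.println(type); // application/pdf
  }
}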
Example #3
  /**
   * We don't currently support the .xlsb file format (an OOXML container with binary blobs), but we
   * shouldn't break on these files either (TIKA-826)
   */
  @Test
  public void testExcelXLSB() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    Metadata m = new Metadata();
    m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType type;
    try (InputStream input =
        ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
      type = detector.detect(input, m);
      assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());
    }

    // OfficeParser won't handle it
    assertFalse((new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));

    // OOXMLParser won't handle it
    assertFalse((new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));

    // AutoDetectParser doesn't break on it
    try (InputStream input =
        ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
      ContentHandler handler = new BodyContentHandler(-1);
      ParseContext context = new ParseContext();
      context.set(Locale.class, Locale.US);
      parser.parse(input, handler, m, context);

      String content = handler.toString();
      assertEquals("", content);
    }
  }
    @Override
    public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      // Is it a supported image?
      String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
      String type = metadata.get(Metadata.CONTENT_TYPE);
      boolean accept = false;

      if (type != null) {
        for (MediaType mt : types) {
          if (mt.toString().equals(type)) {
            accept = true;
          }
        }
      }
      if (filename != null) {
        for (MediaType mt : types) {
          String ext = "." + mt.getSubtype();
          if (filename.endsWith(ext)) {
            accept = true;
          }
        }
      }

      if (!accept) return;

      handleImage(stream, filename, type);
    }
Example #5
 /** Returns true if mediaType falls within the given range (pattern), false otherwise. */
 private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
   String WILDCARD = "*";
   String rangePatternType = rangePattern.getType();
   String rangePatternSubtype = rangePattern.getSubtype();
   return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
       && (rangePatternSubtype.equals(WILDCARD)
           || rangePatternSubtype.equals(mediaType.getSubtype()));
 }
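
A quick demonstration of the range semantics, with the predicate reproduced so the sketch is self-contained:

import org.apache.tika.mime.MediaType;

public class MediaTypeMatchDemo {
  // Same predicate as above.
  static boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
    String WILDCARD = "*";
    String rangePatternType = rangePattern.getType();
    String rangePatternSubtype = rangePattern.getSubtype();
    return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
        && (rangePatternSubtype.equals(WILDCARD)
            || rangePatternSubtype.equals(mediaType.getSubtype()));
  }

  public static void main(String[] args) {
    MediaType html = MediaType.parse("text/html");
    System.out.println(isMediaTypeMatch(html, new MediaType("*", "*")));     // true
    System.out.println(isMediaTypeMatch(html, new MediaType("text", "*")));  // true
    System.out.println(isMediaTypeMatch(html, new MediaType("image", "*"))); // false
  }
}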
 static {
   SUPPORTED_MIMETYPES = new ArrayList<String>();
   Parser p = new PackageParser();
   for (MediaType mt : p.getSupportedTypes(null)) {
     // Tika can probably extract some useful text from these types
     SUPPORTED_MIMETYPES.add(mt.toString());
   }
 }
Example #7
  public static String getMimeTypeFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
      result = mt.getType() + "/" + mt.getSubtype();
    }

    return result;
  }
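
Note that Tika's MediaType can do this directly: getBaseType() drops all parameters and keeps only type/subtype. A minimal sketch:

import org.apache.tika.mime.MediaType;

public class BaseTypeDemo {
  public static void main(String[] args) {
    MediaType mt = MediaType.parse("text/html; charset=UTF-8");
    // Equivalent to the helper above: strip the parameters.
    System.out.println(mt.getBaseType()); // text/html
  }
}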
Example #8
  public static String getCharsetFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
      String charset = mt.getParameters().get("charset");
      if (charset != null) {
        result = charset;
      }
    }

    return result;
  }
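
The same lookup, sketched directly against a parsed MediaType:

import org.apache.tika.mime.MediaType;

public class CharsetParamDemo {
  public static void main(String[] args) {
    MediaType mt = MediaType.parse("text/html; charset=ISO-8859-1");
    // getParameters() exposes the attribute/value pairs after the subtype.
    System.out.println(mt.getParameters().get("charset")); // ISO-8859-1
  }
}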
Example #9
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    AutoDetectReader reader =
        new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
    try {
      Charset charset = reader.getCharset();
      String previous = metadata.get(Metadata.CONTENT_TYPE);
      MediaType contentType = null;
      if (previous == null || previous.startsWith("text/html")) {
        contentType = new MediaType(MediaType.TEXT_HTML, charset);
      } else if (previous.startsWith("application/xhtml+xml")) {
        contentType = new MediaType(XHTML, charset);
      } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
        contentType = new MediaType(WAP_XHTML, charset);
      } else if (previous.startsWith("application/x-asp")) {
        contentType = new MediaType(X_ASP, charset);
      }
      if (contentType != null) {
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
      }
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // Get the HTML mapper from the parse context
      HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      // Parse the HTML document
      org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

      // Use schema from context or default
      Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse the shared schema to avoid heavy instantiation
      parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

      parser.setContentHandler(
          new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata)));

      parser.parse(reader.asInputSource());
    } finally {
      reader.close();
    }
  }
Example #10
/**
 * A more "natural" implementation of an XML parser. Instead of "generating" HTML-like wrapper
 * events and producing only the PCDATA (which is why the default XMLParser might better be called
 * "embedded"), this parser produces the actual XML start/end document and tag events (which get
 * wrapped by Tika's own {@link org.apache.tika.parser.xml.XMLParser XMLParser}). Furthermore,
 * this parser semi-structures the elements' PCDATA text by separating content from different
 * elements with linebreaks, indenting PCDATA content according to the current element's depth,
 * and dropping any ("ignorable") character stretches consisting only of spaces.
 *
 * @author Florian Leitner
 */
public class UnembeddedXMLParser extends AbstractParser {
  /** Serial version UID */
  private static final long serialVersionUID = -6028860725229212437L;
  /** Only support XML */
  private static final Set<MediaType> SUPPORTED_TYPES =
      Collections.unmodifiableSet(
          new HashSet<MediaType>(Arrays.asList(MediaType.application("xml"))));

  /** {@inheritDoc} */
  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  /**
   * Parse the input stream with a SAX parser. Wraps the content handler in an {@link
   * org.apache.tika.sax.OfflineContentHandler} so that no namespace lookups are made. In
   * addition, by overriding {@link #getContentHandler(ContentHandler, Metadata, ParseContext)}, it
   * is possible to add further wrappers.
   *
   * @param stream that should be parsed
   * @param handler that will receive the SAX events
   * @param metadata of current document stream
   * @param context of current parse
   * @throws IOException if the stream cannot be read
   * @throws SAXException if the SAX parsing fails.
   * @throws TikaException if the XML parsing fails.
   */
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    final TaggedContentHandler tagged = new TaggedContentHandler(handler);
    if (metadata.get(HttpHeaders.CONTENT_TYPE) == null) {
      metadata.set(HttpHeaders.CONTENT_TYPE, "application/xml");
    }
    try {
      context
          .getSAXParser()
          .parse(
              new CloseShieldInputStream(stream),
              new OfflineContentHandler(getContentHandler(tagged, metadata, context)));
    } catch (final SAXException e) {
      tagged.throwIfCauseOf(e);
      throw new TikaException("XML parse error", e);
    }
  }

  /**
   * Returns the handler unchanged (i.e., does nothing). This method can be overridden to wrap the
   * content handler with additional handlers.
   *
   * @param handler to wrap
   * @param metadata of current document
   * @param context of current parse
   * @return the (possibly wrapped) content handler
   */
  protected ContentHandler getContentHandler(
      ContentHandler handler, Metadata metadata, ParseContext context) {
    return handler;
  }
}
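
A minimal usage sketch for the class above, assuming it is on the classpath; ToXMLContentHandler serializes the preserved element events back into markup:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.ToXMLContentHandler;

public class UnembeddedXmlDemo {
  public static void main(String[] args) throws Exception {
    String xml = "<doc><title>Hello</title></doc>";
    try (InputStream in = new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))) {
      ToXMLContentHandler handler = new ToXMLContentHandler();
      // Unlike Tika's default XMLParser, the source document's element events
      // survive, so the serialized output mirrors the input markup.
      new UnembeddedXMLParser().parse(in, handler, new Metadata(), new ParseContext());
      System.out.println(handler);
    }
  }
}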
 /**
  * Detects the content type of the given input event. Returns
  * <code>application/octet-stream</code> if the type of the event cannot be detected.
  *
  * <p>It is legal for the event headers or body to be empty. The detector may read bytes from
  * the start of the body stream to help in type detection.
  *
  * @return detected media type, or <code>application/octet-stream</code>
  */
 private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
   MediaType mediaType;
   try {
     mediaType = getDetector().detect(in, metadata);
   } catch (IOException e) {
     throw new MorphlineRuntimeException(e);
   }
   String mediaTypeStr = mediaType.toString();
   if (excludeParameters) {
     int i = mediaTypeStr.indexOf(';');
     if (i >= 0) {
       mediaTypeStr = mediaTypeStr.substring(0, i);
     }
   }
   return mediaTypeStr;
 }
Example #12
/** Tika parser for Microsoft Project MPX files (text-based). */
public class MPXParser extends AbstractParser {
  private static final long serialVersionUID = -4791025107910605527L;

  private static final List<MediaType> TYPES =
      Arrays.asList(MediaType.application("x-project"));

  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return new HashSet<MediaType>(TYPES);
  }

  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, TikaException, SAXException {
    MPXReader reader = new MPXReader();
    ProjectFile project = null;

    try {
      project = reader.read(stream);
    } catch (MPXJException e) {
      throw new TikaException("Error reading MPX file", e);
    }

    // Extract helpful information out
    ProjectFileProcessor.parse(project, handler, metadata, context);
  }
}
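
A usage sketch for the parser above (the file name is hypothetical):

import java.io.FileInputStream;
import java.io.InputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;

public class MpxDemo {
  public static void main(String[] args) throws Exception {
    try (InputStream in = new FileInputStream("schedule.mpx")) { // hypothetical file
      BodyContentHandler handler = new BodyContentHandler(-1); // -1 disables the write limit
      new MPXParser().parse(in, handler, new Metadata(), new ParseContext());
      System.out.println(handler);
    }
  }
}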
  protected void importDataArchive(
      Resource archive, InputStream resourceStream, BatchImportOptions options) {
    BufferedInputStream bufferedResourceStream = null;
    try {
      // Make sure the stream is buffered
      if (resourceStream instanceof BufferedInputStream) {
        bufferedResourceStream = (BufferedInputStream) resourceStream;
      } else {
        bufferedResourceStream = new BufferedInputStream(resourceStream);
      }

      // Buffer up to 100MB, bad things will happen if we bust this buffer.
      // TODO see if there is a buffered stream that will write to a file once the buffer fills up
      bufferedResourceStream.mark(100 * 1024 * 1024);
      final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename());

      if (MT_JAVA_ARCHIVE.equals(type)) {
        final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MediaType.APPLICATION_ZIP.equals(type)) {
        final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_CPIO.equals(type)) {
        final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_AR.equals(type)) {
        final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_TAR.equals(type)) {
        final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_BZIP2.equals(type)) {
        final CompressorInputStream compressedStream =
            new BZip2CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_GZIP.equals(type)) {
        final CompressorInputStream compressedStream =
            new GzipCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_PACK200.equals(type)) {
        final CompressorInputStream compressedStream =
            new Pack200CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_XZ.equals(type)) {
        final CompressorInputStream compressedStream =
            new XZCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else {
        throw new RuntimeException("Unrecognized archive media type: " + type);
      }
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    } finally {
      IOUtils.closeQuietly(bufferedResourceStream);
    }
  }
    private TikaImageExtractingParser(RenderingContext renderingContext) {
      this.renderingContext = renderingContext;

      // Our expected types
      types = new HashSet<MediaType>();
      types.add(MediaType.image("bmp"));
      types.add(MediaType.image("gif"));
      types.add(MediaType.image("jpg"));
      types.add(MediaType.image("jpeg"));
      types.add(MediaType.image("png"));
      types.add(MediaType.image("tiff"));

      // Are images going in the same place as the HTML?
      if (renderingContext.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) {
        RenditionLocation location =
            resolveRenditionLocation(
                renderingContext.getSourceNode(),
                renderingContext.getDefinition(),
                renderingContext.getDestinationNode());
        imgFolder = location.getParentRef();
        if (logger.isDebugEnabled()) {
          logger.debug("Using imgFolder: " + imgFolder);
        }
      }
    }
  public String getContentType(InputStream inputStream, String fileName) {
    if ((inputStream == null) && Validator.isNull(fileName)) {
      return ContentTypes.APPLICATION_OCTET_STREAM;
    }

    String contentType = null;

    try {
      Metadata metadata = new Metadata();

      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      MediaType mediaType = _detector.detect(TikaInputStream.get(inputStream), metadata);

      contentType = mediaType.toString();

      if (contentType.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + contentType);
        }

        contentType = getContentType(fileName);
      }

      if (contentType.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + contentType);
        }

        contentType = ContentTypes.APPLICATION_OCTET_STREAM;
      }
    } catch (Exception e) {
      _log.error(e, e);

      contentType = ContentTypes.APPLICATION_OCTET_STREAM;
    }

    return contentType;
  }
  /** @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap */
  public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
      throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
      if (XML_MEDIA_TYPES.contains(mediaType)) {
        return processXml(url, content);
      } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
        return (AbstractSiteMap) processText(url.toString(), content);
      } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
        return processGzip(url, content);
      } else {
        // Climb to the parent type and let the loop re-check; it terminates
        // once we reach application/octet-stream (or run out of supertypes).
        mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType);
      }
    }

    throw new UnknownFormatException(
        "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
  }
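
The supertype climb in the loop above can be observed directly against Tika's default registry; a minimal sketch:

import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;

public class SupertypeWalk {
  public static void main(String[] args) {
    MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
    MediaType type = MediaType.parse("application/xhtml+xml");
    while (type != null) {
      System.out.println(type);
      // One level up the hierarchy; the chain ends at application/octet-stream.
      type = registry.getSupertype(type);
    }
  }
}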
Example #17
  public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    for (ZipEntry entry : Collections.list(zip.entries())) {
      // Is it an Open Document file?
      if (entry.getName().equals("mimetype")) {
        InputStream stream = zip.getInputStream(entry);
        try {
          return fromString(IOUtils.toString(stream, "UTF-8"));
        } finally {
          stream.close();
        }
      } else if (entry.getName().equals("_rels/.rels")
          || entry.getName().equals("[Content_Types].xml")) {
        // Office Open XML File
        // Ask POI to open and investigate it for us
        try {
          OPCPackage pkg = OPCPackage.open(input.getFile().toString());
          input.setOpenContainer(pkg);

          PackageRelationshipCollection core =
              pkg.getRelationshipsByType(ExtractorFactory.CORE_DOCUMENT_REL);
          if (core.size() != 1) {
            throw new IOException(
                "Invalid OOXML Package received - expected 1 core document, found " + core.size());
          }

          // Get the type of the core document part
          PackagePart corePart = pkg.getPart(core.getRelationship(0));
          String coreType = corePart.getContentType();

          // Turn that into the type of the overall document
          String docType = coreType.substring(0, coreType.lastIndexOf('.'));
          return fromString(docType);
        } catch (InvalidFormatException e) {
          throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage());
        }
      } else if (entry.getName().equals("buildVersionHistory.plist")) {
        // This is an iWork document

        // Reset and ask
        zip.close();
        zip = new ZipFile(input.getFile());
        return IWorkPackageParser.identifyType(zip);
      } else if (entry.getName().equals("META-INF/")) {
        // Java Jar
        return MediaType.application("java-archive");
      }
    }

    return MediaType.APPLICATION_ZIP;
  }
  /**
   * Performs a one-time initialization of Tika's media type components and the media type
   * collection constants. <br>
   * Please note that this is a private static method which is called once per class (not per
   * instance/object).
   */
  private static void initMediaTypes() {
    /* XML media types (and all aliases) */
    XML_MEDIA_TYPES.add(APPLICATION_XML);
    XML_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(APPLICATION_XML));

    /* TEXT media types (and all aliases) */
    TEXT_MEDIA_TYPES.add(TEXT_PLAIN);
    TEXT_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(TEXT_PLAIN));

    /* GZIP media types (and all aliases) */
    MediaType gzipMediaType = MediaType.parse("application/gzip");
    GZ_MEDIA_TYPES.add(gzipMediaType);
    GZ_MEDIA_TYPES.addAll(MEDIA_TYPE_REGISTRY.getAliases(gzipMediaType));
  }
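
Aliases are what let the code above accept legacy names such as application/x-gzip alongside the canonical type; a minimal sketch:

import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;

public class AliasDemo {
  public static void main(String[] args) {
    MediaTypeRegistry registry = MediaTypeRegistry.getDefaultRegistry();
    // Prints the alias set registered for application/gzip.
    System.out.println(registry.getAliases(MediaType.parse("application/gzip")));
  }
}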
Example #19
  private static Metadata tika_parse(File audioFile) {
    Metadata metadata = new Metadata();
    try {
      String filetype = new Tika().detect(audioFile);
      metadata.set("tika.filetype", filetype);
      metadata.set("file.size", Long.toString(audioFile.length()));

      BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(audioFile));
      new AutoDetectParser().parse(inputStream, new BodyContentHandler(), metadata);
      inputStream.close();

      for (String key : metadata.names()) {
        StringBuilder dataBuilder = new StringBuilder();
        if (metadata.isMultiValued(key)) {
          for (String val : metadata.getValues(key)) {
            if (dataBuilder.length() > 0) {
              dataBuilder.append(", ");
            }
            dataBuilder.append(val);
          }
        } else {
          dataBuilder.append(metadata.get(key));
        }
        metadata.set(key, dataBuilder.toString().trim());
      }

      inputStream = new BufferedInputStream(new FileInputStream(audioFile));
      MediaType media = new DefaultDetector().detect(inputStream, new Metadata());
      inputStream.close();
      metadata.set("media", media.toString());
    } catch (SAXException | IOException | TikaException e) {
      metadata.set(
          "error_tika_parse",
          "tika_parse error processing file (" + audioFile.getName() + "): " + e.getMessage());
    }
    return metadata;
  }
  /*
  If Tesseract is found, test that we retrieve the proper number of parsers supporting the type.
   */
  @Test
  public void offersTypesIfFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();

    ParseContext parseContext = new ParseContext();
    MediaType png = MediaType.image("png");

    // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.
    assumeTrue(canRun());

    assertEquals(5, parser.getSupportedTypes(parseContext).size());
    assertTrue(parser.getSupportedTypes(parseContext).contains(png));

    // DefaultParser will now select the TesseractOCRParser.
    assertEquals(
        TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
  }
  protected Parser getParser(Metadata metadata, ParseContext context) {
    Map<MediaType, Parser> map = getParsers(context);
    MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
    if (type != null) {
      // We always work on the normalised, canonical form
      type = registry.normalize(type);
    }
    while (type != null) {
      // Try finding a parser for the type
      Parser parser = map.get(type);
      if (parser != null) {
        return parser;
      }

      // Failing that, try for the parent of the type
      type = registry.getSupertype(type);
    }
    return fallback;
  }
  /*
   * (non-Javadoc)
   * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
   */
  @Override
  protected void render(RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();
    String sourceMimeType = contentReader.getMimetype();

    // Check that Tika supports the supplied file
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    MediaType sourceMediaType = MediaType.parse(sourceMimeType);
    if (!p.getParsers().containsKey(sourceMediaType)) {
      throw new RenditionServiceException(
          "Source mime type of "
              + sourceMimeType
              + " is not supported by Tika for HTML conversions");
    }

    // Make the HTML Version using Tika
    // This will also extract out any images as found
    generateHTML(p, context);
  }
  /*
  Check that, if Tesseract is not found, the TesseractOCRParser claims to support no file
  types, so the standard image parser is called instead.
   */
  @Test
  public void offersNoTypesIfNotFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();
    MediaType png = MediaType.image("png");

    // With an invalid path, will offer no types
    TesseractOCRConfig invalidConfig = new TesseractOCRConfig();
    invalidConfig.setTesseractPath("/made/up/path");

    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, invalidConfig);

    // No types offered
    assertEquals(0, parser.getSupportedTypes(parseContext).size());

    // And DefaultParser won't use us
    assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
  }
Example #24
 private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag)
     throws TikaException, IOException {
   Set<MediaType> types = null;
   NodeList children = node.getChildNodes();
   for (int i = 0; i < children.getLength(); i++) {
     Node cNode = children.item(i);
     if (cNode instanceof Element) {
       Element cElement = (Element) cNode;
       if (tag.equals(cElement.getTagName())) {
         String mime = getText(cElement);
         MediaType type = MediaType.parse(mime);
         if (type != null) {
           if (types == null) types = new HashSet<>();
           types.add(type);
         } else {
           throw new TikaException("Invalid media type name: " + mime);
         }
       }
     }
   }
   if (types != null) return types;
   return Collections.emptySet();
 }
Example #25
 public MediaTypeFilter() {
   this.mediaTypes =
       MediaType.set(MediaType.TEXT_HTML, MediaType.TEXT_PLAIN, MediaType.APPLICATION_XML);
 }
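
One caveat worth noting: MediaType equality includes parameters, so a detected type such as text/html; charset=UTF-8 will not match this set directly; strip parameters first (e.g., with getBaseType()). A small sketch:

import java.util.Set;
import org.apache.tika.mime.MediaType;

public class FilterSetDemo {
  public static void main(String[] args) {
    Set<MediaType> allowed =
        MediaType.set(MediaType.TEXT_HTML, MediaType.TEXT_PLAIN, MediaType.APPLICATION_XML);
    MediaType detected = MediaType.parse("text/html; charset=UTF-8");
    System.out.println(allowed.contains(detected));               // false: parameters differ
    System.out.println(allowed.contains(detected.getBaseType())); // true
  }
}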
Example #26
/**
 * HTML parser. Uses TagSoup to turn the input document into HTML SAX events, and post-processes
 * the events to produce the XHTML and metadata expected by Tika clients.
 */
public class HtmlParser extends AbstractParser {

  /** Serial version UID */
  private static final long serialVersionUID = 7895315240498733128L;

  private static final MediaType XHTML = MediaType.application("xhtml+xml");
  private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
  private static final MediaType X_ASP = MediaType.application("x-asp");

  private static final Set<MediaType> SUPPORTED_TYPES =
      Collections.unmodifiableSet(
          new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP)));

  private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader());

  /** HTML schema singleton used to amortise the heavy instantiation time. */
  private static final Schema HTML_SCHEMA = new HTMLSchema();

  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return SUPPORTED_TYPES;
  }

  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    AutoDetectReader reader =
        new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
    try {
      Charset charset = reader.getCharset();
      String previous = metadata.get(Metadata.CONTENT_TYPE);
      MediaType contentType = null;
      if (previous == null || previous.startsWith("text/html")) {
        contentType = new MediaType(MediaType.TEXT_HTML, charset);
      } else if (previous.startsWith("application/xhtml+xml")) {
        contentType = new MediaType(XHTML, charset);
      } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
        contentType = new MediaType(WAP_XHTML, charset);
      } else if (previous.startsWith("application/x-asp")) {
        contentType = new MediaType(X_ASP, charset);
      }
      if (contentType != null) {
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
      }
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // Get the HTML mapper from the parse context
      HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      // Parse the HTML document
      org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

      // Use schema from context or default
      Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse the shared schema to avoid heavy instantiation
      parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

      parser.setContentHandler(
          new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata)));

      parser.parse(reader.asInputSource());
    } finally {
      reader.close();
    }
  }

  /**
   * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown
   * or deemed unsafe for inclusion in the parse output, then this method returns <code>null</code>
   * and the element will be ignored but the content inside it is still processed. See the {@link
   * #isDiscardElement(String)} method for a way to discard the entire contents of an element.
   *
   * <p>Subclasses can override this method to customize the default mapping.
   *
   * @param name HTML element name (upper case)
   * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  protected String mapSafeElement(String name) {
    return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);
  }

  /**
   * Checks whether all content within the given HTML element should be discarded instead of
   * including it in the parse output. Subclasses can override this method to customize the set of
   * discarded elements.
   *
   * @param name HTML element name (upper case)
   * @return <code>true</code> if content inside the named element should be ignored, <code>false
   *     </code> otherwise
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  protected boolean isDiscardElement(String name) {
    return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);
  }

  /**
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
   */
  public String mapSafeAttribute(String elementName, String attributeName) {
    return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);
  }

  /**
   * Adapter class that maintains backwards compatibility with the protected HtmlParser methods.
   * Making HtmlParser implement HtmlMapper directly would require those methods to be public, which
   * would break backwards compatibility with subclasses.
   *
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will
   *     be removed in Tika 1.0.
   */
  private class HtmlParserMapper implements HtmlMapper {
    public String mapSafeElement(String name) {
      return HtmlParser.this.mapSafeElement(name);
    }

    public boolean isDiscardElement(String name) {
      return HtmlParser.this.isDiscardElement(name);
    }

    public String mapSafeAttribute(String elementName, String attributeName) {
      return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
    }
  }
}
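
A short usage sketch for the class above (this is Tika's org.apache.tika.parser.html.HtmlParser), parsing markup from memory:

import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.BodyContentHandler;

public class HtmlParserDemo {
  public static void main(String[] args) throws Exception {
    String html = "<html><head><title>Hi</title></head><body><p>Hello</p></body></html>";
    try (InputStream in = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8))) {
      BodyContentHandler handler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      new HtmlParser().parse(in, handler, metadata, new ParseContext());
      // BodyContentHandler collects only the textual content of <body>.
      System.out.println(handler.toString().trim()); // Hello
      // The parser records the type plus detected charset in the metadata.
      System.out.println(metadata.get(Metadata.CONTENT_TYPE)); // text/html; charset=...
    }
  }
}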
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            // The MatchingContentHandler does not invoke startDocument. See
            // http://tika.markmail.org/message/kknu3hw7argwiqin
            serializer.startDocument();
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: " + pwMapFile);
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          if (ignoreTikaException)
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
        if (!extractOnly) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDocument() on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }
Example #28
    public void parseEmbedded(
        InputStream inputStream,
        ContentHandler contentHandler,
        Metadata metadata,
        boolean outputHtml)
        throws SAXException, IOException {
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

      if (name == null) {
        name = "file" + count++;
      }

      MediaType contentType = detector.detect(inputStream, metadata);

      if (name.indexOf('.') == -1 && contentType != null) {
        try {
          name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {
          logger.warn("Could not determine an extension for type " + contentType, e);
        }
      }

      String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
      if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;
      }

      File outputFile = new File(extractDir, name);
      File parent = outputFile.getParentFile();
      if (!parent.exists()) {
        if (!parent.mkdirs()) {
          throw new IOException("unable to create directory \"" + parent + "\"");
        }
      }
      System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile);

      FileOutputStream os = null;

      try {
        os = new FileOutputStream(outputFile);

        if (inputStream instanceof TikaInputStream) {
          TikaInputStream tin = (TikaInputStream) inputStream;

          if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
            POIFSFileSystem fs = new POIFSFileSystem();
            copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
            fs.writeFilesystem(os);
          } else {
            IOUtils.copy(inputStream, os);
          }
        } else {
          IOUtils.copy(inputStream, os);
        }
      } catch (Exception e) {
        logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
      } finally {
        if (os != null) {
          os.close();
        }
      }
    }
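
For context, an extractor like the one above is registered through the ParseContext; container-aware parsers then call parseEmbedded() once per attachment. A minimal wiring sketch (the container file name is hypothetical):

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class EmbeddedExtractionDemo {
  public static void main(String[] args) throws Exception {
    ParseContext context = new ParseContext();
    context.set(
        EmbeddedDocumentExtractor.class,
        new EmbeddedDocumentExtractor() {
          public boolean shouldParseEmbedded(Metadata metadata) {
            return true;
          }

          public void parseEmbedded(
              InputStream stream, ContentHandler handler, Metadata metadata, boolean outputHtml)
              throws SAXException, IOException {
            // Stand-in for the file-saving logic above: just report the name.
            System.out.println("embedded: " + metadata.get(Metadata.RESOURCE_NAME_KEY));
          }
        });
    try (InputStream in = Files.newInputStream(Paths.get("container.doc"))) { // hypothetical file
      new AutoDetectParser().parse(in, new BodyContentHandler(-1), new Metadata(), context);
    }
  }
}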
/**
 * Pulls together {@link IPortalDataType}, {@link IDataUpgrader}, and {@link IDataImporter}
 * implementations to handle data upgrade, import, export and removal operations.
 *
 * @author Eric Dalquist
 */
@Service("portalDataHandlerService")
public class JaxbPortalDataHandlerService implements IPortalDataHandlerService {

  /** Tracks the base import directory to allow for easier-to-read logging when importing */
  private static final ThreadLocal<String> IMPORT_BASE_DIR = new ThreadLocal<String>();

  private static final String REPORT_FORMAT = "%s,%s,%.2fms\n";

  private static final MediaType MT_JAVA_ARCHIVE = MediaType.application("java-archive");
  private static final MediaType MT_CPIO = MediaType.application("x-cpio");
  private static final MediaType MT_AR = MediaType.application("x-archive");
  private static final MediaType MT_TAR = MediaType.application("x-tar");
  private static final MediaType MT_BZIP2 = MediaType.application("x-bzip2");
  private static final MediaType MT_GZIP = MediaType.application("x-gzip");
  private static final MediaType MT_PACK200 = MediaType.application("x-java-pack200");
  private static final MediaType MT_XZ = MediaType.application("x-xz");

  protected final Logger logger = LoggerFactory.getLogger(getClass());

  // Order in which data must be imported
  private List<PortalDataKey> dataKeyImportOrder = Collections.emptyList();
  // Map to lookup the associated IPortalDataType for each known PortalDataKey
  private Map<PortalDataKey, IPortalDataType> dataKeyTypes = Collections.emptyMap();

  // Ant path matcher patterns that a file must match when scanning directories (unless a pattern is
  // explicitly specified)
  private Set<String> dataFileIncludes = Collections.emptySet();
  private Set<String> dataFileExcludes = ImmutableSet.copyOf(DirectoryScanner.getDefaultExcludes());

  // Data upgraders mapped by PortalDataKey
  private Map<PortalDataKey, IDataUpgrader> portalDataUpgraders = Collections.emptyMap();
  // Data importers mapped by PortalDataKey
  private Map<PortalDataKey, IDataImporter<Object>> portalDataImporters = Collections.emptyMap();

  // ExportAll data types
  private Set<IPortalDataType> exportAllPortalDataTypes = null;
  // All portal data types available for export
  private Set<IPortalDataType> exportPortalDataTypes = Collections.emptySet();
  // Data exporters mapped by IPortalDateType#getTypeId()
  private Map<String, IDataExporter<Object>> portalDataExporters = Collections.emptyMap();

  // All portal data types available for delete
  private Set<IPortalDataType> deletePortalDataTypes = Collections.emptySet();
  // Data deleters mapped by IPortalDateType#getTypeId()
  private Map<String, IDataDeleter<Object>> portalDataDeleters = Collections.emptyMap();

  private org.apereo.portal.utils.DirectoryScanner directoryScanner;
  private ExecutorService importExportThreadPool;
  private XmlUtilities xmlUtilities;

  private long maxWait = -1;
  private TimeUnit maxWaitTimeUnit = TimeUnit.MILLISECONDS;

  @Autowired
  public void setXmlUtilities(XmlUtilities xmlUtilities) {
    this.xmlUtilities = xmlUtilities;
  }

  @Autowired
  public void setImportExportThreadPool(
      @Qualifier("importExportThreadPool") ExecutorService importExportThreadPool) {
    this.importExportThreadPool = importExportThreadPool;
    this.directoryScanner = new ConcurrentDirectoryScanner(this.importExportThreadPool);
  }

  /** Maximum time to wait for an import, export, or delete to execute. */
  public void setMaxWait(long maxWait) {
    this.maxWait = maxWait;
  }

  /** {@link TimeUnit} for {@link #setMaxWait(long)} value. */
  public void setMaxWaitTimeUnit(TimeUnit maxWaitTimeUnit) {
    this.maxWaitTimeUnit = maxWaitTimeUnit;
  }

  /** Order in which data types should be imported. */
  @javax.annotation.Resource(name = "dataTypeImportOrder")
  public void setDataTypeImportOrder(List<IPortalDataType> dataTypeImportOrder) {
    final ArrayList<PortalDataKey> dataKeyImportOrder =
        new ArrayList<PortalDataKey>(dataTypeImportOrder.size() * 2);
    final Map<PortalDataKey, IPortalDataType> dataKeyTypes =
        new LinkedHashMap<PortalDataKey, IPortalDataType>(dataTypeImportOrder.size() * 2);

    for (final IPortalDataType portalDataType : dataTypeImportOrder) {
      final List<PortalDataKey> supportedDataKeys = portalDataType.getDataKeyImportOrder();
      for (final PortalDataKey portalDataKey : supportedDataKeys) {
        dataKeyImportOrder.add(portalDataKey);
        dataKeyTypes.put(portalDataKey, portalDataType);
      }
    }

    dataKeyImportOrder.trimToSize();
    this.dataKeyImportOrder = Collections.unmodifiableList(dataKeyImportOrder);
    this.dataKeyTypes = Collections.unmodifiableMap(dataKeyTypes);
  }

  /** Ant path matching patterns that files must match to be included */
  @javax.annotation.Resource(name = "dataFileIncludes")
  public void setDataFileIncludes(Set<String> dataFileIncludes) {
    this.dataFileIncludes = dataFileIncludes;
  }

  /**
   * Ant path matching patterns that exclude matched files. Defaults to {@link
   * DirectoryScanner#addDefaultExcludes()}
   */
  public void setDataFileExcludes(Set<String> dataFileExcludes) {
    this.dataFileExcludes = dataFileExcludes;
  }

  /** {@link IDataImporter} implementations to delegate import operations to. */
  @SuppressWarnings("unchecked")
  @Autowired(required = false)
  public void setDataImporters(Collection<IDataImporter<? extends Object>> dataImporters) {
    final Map<PortalDataKey, IDataImporter<Object>> dataImportersMap =
        new LinkedHashMap<PortalDataKey, IDataImporter<Object>>();

    for (final IDataImporter<?> dataImporter : dataImporters) {

      try {

        final Set<PortalDataKey> importDataKeys = dataImporter.getImportDataKeys();

        for (final PortalDataKey importDataKey : importDataKeys) {
          this.logger.debug(
              "Registering IDataImporter for '{}' - {}",
              new Object[] {importDataKey, dataImporter});
          final IDataImporter<Object> existing =
              dataImportersMap.put(importDataKey, (IDataImporter<Object>) dataImporter);
          if (existing != null) {
            this.logger.warn(
                "Duplicate IDataImporter PortalDataKey for {} Replacing {} with {}",
                new Object[] {importDataKey, existing, dataImporter});
          }
        }

      } catch (Exception exception) {
        logger.error("Failed to register data importer {}.", dataImporter, exception);
      }
    }

    this.portalDataImporters = Collections.unmodifiableMap(dataImportersMap);
  }

  /** {@link IDataExporter} implementations to delegate export operations to. */
  @SuppressWarnings("unchecked")
  @Autowired(required = false)
  public void setDataExporters(Collection<IDataExporter<? extends Object>> dataExporters) {
    final Map<String, IDataExporter<Object>> dataExportersMap =
        new LinkedHashMap<String, IDataExporter<Object>>();

    final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>();

    for (final IDataExporter<?> dataExporter : dataExporters) {

      try {

        final IPortalDataType portalDataType = dataExporter.getPortalDataType();
        final String typeId = portalDataType.getTypeId();

        this.logger.debug(
            "Registering IDataExporter for '{}' - {}", new Object[] {typeId, dataExporter});
        final IDataExporter<Object> existing =
            dataExportersMap.put(typeId, (IDataExporter<Object>) dataExporter);
        if (existing != null) {
          this.logger.warn(
              "Duplicate IDataExporter typeId for {} Replacing {} with {}",
              new Object[] {typeId, existing, dataExporter});
        }

        portalDataTypes.add(portalDataType);

      } catch (Exception exception) {
        logger.error("Failed to register data exporter {}.", dataExporter, exception);
      }
    }

    this.portalDataExporters = Collections.unmodifiableMap(dataExportersMap);
    this.exportPortalDataTypes = Collections.unmodifiableSet(portalDataTypes);
  }

  /**
   * Optional set of all portal data types to export. If not specified all available portal data
   * types will be listed.
   */
  @javax.annotation.Resource(name = "exportAllPortalDataTypes")
  public void setExportAllPortalDataTypes(Set<IPortalDataType> exportAllPortalDataTypes) {
    this.exportAllPortalDataTypes = ImmutableSet.copyOf(exportAllPortalDataTypes);
  }

  /** {@link IDataDeleter} implementations to delegate delete operations to. */
  @SuppressWarnings("unchecked")
  @Autowired(required = false)
  public void setDataDeleters(Collection<IDataDeleter<? extends Object>> dataDeleters) {
    final Map<String, IDataDeleter<Object>> dataDeletersMap =
        new LinkedHashMap<String, IDataDeleter<Object>>();

    final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>();

    for (final IDataDeleter<?> dataDeleter : dataDeleters) {

      try {

        final IPortalDataType portalDataType = dataDeleter.getPortalDataType();
        final String typeId = portalDataType.getTypeId();

        this.logger.debug(
            "Registering IDataDeleter for '{}' - {}", new Object[] {typeId, dataDeleter});
        final IDataDeleter<Object> existing =
            dataDeletersMap.put(typeId, (IDataDeleter<Object>) dataDeleter);
        if (existing != null) {
          this.logger.warn(
              "Duplicate IDataDeleter typeId for {} Replacing {} with {}",
              new Object[] {typeId, existing, dataDeleter});
        }

        portalDataTypes.add(portalDataType);

      } catch (Exception exception) {
        logger.error("Failed to register data deleter {}.", dataDeleter, exception);
      }
    }

    this.portalDataDeleters = Collections.unmodifiableMap(dataDeletersMap);
    this.deletePortalDataTypes = Collections.unmodifiableSet(portalDataTypes);
  }

  /** {@link IDataUpgrader} implementations to delegate upgrade operations to. */
  @Autowired(required = false)
  public void setDataUpgraders(Collection<IDataUpgrader> dataUpgraders) {
    final Map<PortalDataKey, IDataUpgrader> dataUpgraderMap =
        new LinkedHashMap<PortalDataKey, IDataUpgrader>();

    for (final IDataUpgrader dataUpgrader : dataUpgraders) {

      try {

        final Set<PortalDataKey> upgradeDataKeys = dataUpgrader.getSourceDataTypes();
        for (final PortalDataKey upgradeDataKey : upgradeDataKeys) {
          this.logger.debug(
              "Registering IDataUpgrader for '{}' - {}", upgradeDataKey, dataUpgrader);
          final IDataUpgrader existing = dataUpgraderMap.put(upgradeDataKey, dataUpgrader);
          if (existing != null) {
            this.logger.warn(
                "Duplicate IDataUpgrader PortalDataKey for {} Replacing {} with {}",
                new Object[] {upgradeDataKey, existing, dataUpgrader});
          }
        }

      } catch (Exception exception) {
        logger.error("Failed to register data upgrader {}.", dataUpgrader, exception);
      }
    }

    this.portalDataUpgraders = Collections.unmodifiableMap(dataUpgraderMap);
  }

  @Override
  public void importDataArchive(Resource archive, BatchImportOptions options) {
    try {
      importDataArchive(archive, archive.getInputStream(), options);
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    }
  }

  protected void importDataArchive(
      Resource archive, InputStream resourceStream, BatchImportOptions options) {
    BufferedInputStream bufferedResourceStream = null;
    try {
      // Make sure the stream is buffered
      if (resourceStream instanceof BufferedInputStream) {
        bufferedResourceStream = (BufferedInputStream) resourceStream;
      } else {
        bufferedResourceStream = new BufferedInputStream(resourceStream);
      }

      // Buffer up to 100MB, bad things will happen if we bust this buffer.
      // TODO see if there is a buffered stream that will write to a file once the buffer fills up
      bufferedResourceStream.mark(100 * 1024 * 1024);
      final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename());

      if (MT_JAVA_ARCHIVE.equals(type)) {
        final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MediaType.APPLICATION_ZIP.equals(type)) {
        final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_CPIO.equals(type)) {
        final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_AR.equals(type)) {
        final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_TAR.equals(type)) {
        final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_BZIP2.equals(type)) {
        final CompressorInputStream compressedStream =
            new BZip2CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_GZIP.equals(type)) {
        final CompressorInputStream compressedStream =
            new GzipCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_PACK200.equals(type)) {
        final CompressorInputStream compressedStream =
            new Pack200CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_XZ.equals(type)) {
        final CompressorInputStream compressedStream =
            new XZCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else {
        throw new RuntimeException("Unrecognized archive media type: " + type);
      }
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    } finally {
      IOUtils.closeQuietly(bufferedResourceStream);
    }
  }

  /** Extracts the archive resource and then runs the batch-import process on it. */
  protected void importDataArchive(
      final Resource resource,
      final ArchiveInputStream resourceStream,
      BatchImportOptions options) {

    final File tempDir = Files.createTempDir();
    try {
      ArchiveEntry archiveEntry;
      while ((archiveEntry = resourceStream.getNextEntry()) != null) {
        final File entryFile = new File(tempDir, archiveEntry.getName());
        if (archiveEntry.isDirectory()) {
          entryFile.mkdirs();
        } else {
          entryFile.getParentFile().mkdirs();

          Files.copy(
              new InputSupplier<InputStream>() {
                @Override
                public InputStream getInput() throws IOException {
                  return new CloseShieldInputStream(resourceStream);
                }
              },
              entryFile);
        }
      }

      importDataDirectory(tempDir, null, options);
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to extract data from '" + resource + "' to '" + tempDir + "' for batch import.",
          e);
    } finally {
      FileUtils.deleteQuietly(tempDir);
    }
  }
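
  /*
   * Editorial sketch, not part of the original class: the same extraction loop on java.nio
   * instead of Guava's deprecated InputSupplier overload. Because java.nio.file.Files.copy
   * does not close its source stream, no CloseShieldInputStream is needed here. Note that
   * both versions trust archiveEntry.getName(); a crafted "../" entry name could escape
   * tempDir.
   */
  private static void extractEntriesSketch(ArchiveInputStream archiveStream, File tempDir)
      throws IOException {
    ArchiveEntry entry;
    while ((entry = archiveStream.getNextEntry()) != null) {
      final File entryFile = new File(tempDir, entry.getName());
      if (entry.isDirectory()) {
        entryFile.mkdirs();
      } else {
        entryFile.getParentFile().mkdirs();
        // Copies the current entry only; the archive stream stays open for the next one
        java.nio.file.Files.copy(
            archiveStream,
            entryFile.toPath(),
            java.nio.file.StandardCopyOption.REPLACE_EXISTING);
      }
    }
  }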

  protected MediaType getMediaType(BufferedInputStream inputStream, String fileName)
      throws IOException {
    final TikaInputStream tikaInputStream =
        TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
      final Detector detector = new DefaultDetector();
      final Metadata metadata = new Metadata();
      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      final MediaType type = detector.detect(tikaInputStream, metadata);
      logger.debug("Determined '{}' for '{}'", type, fileName);
      return type;
    } catch (IOException e) {
      logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
      return null;
    } finally {
      IOUtils.closeQuietly(tikaInputStream);

      // Reset the buffered stream to make up for anything read by the detector
      inputStream.reset();
    }
  }
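
  /*
   * Editorial sketch, not part of the original class: the mark/detect/reset pattern from
   * getMediaType in isolation. The mark limit must exceed the number of bytes the detector
   * may consume, or reset() will throw. Assumes the Tika and commons-io imports already
   * used in this example.
   */
  private static MediaType detectWithoutConsuming(
      BufferedInputStream in, String fileName, int markLimit) throws IOException {
    in.mark(markLimit);
    // CloseShieldInputStream keeps Tika from closing the caller's stream
    try (TikaInputStream tikaStream = TikaInputStream.get(new CloseShieldInputStream(in))) {
      final Metadata metadata = new Metadata();
      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);
      return new DefaultDetector().detect(tikaStream, metadata);
    } finally {
      // Rewind so the caller re-reads the bytes the detector consumed
      in.reset();
    }
  }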

  @Override
  public void importDataDirectory(
      File directory, String pattern, final BatchImportOptions options) {
    if (!directory.exists()) {
      throw new IllegalArgumentException(
          "The specified directory '" + directory + "' does not exist");
    }

    // Create the file filter to use when searching for files to import
    final FileFilter fileFilter;
    if (pattern != null) {
      fileFilter = new AntPatternFileFilter(true, false, pattern, this.dataFileExcludes);
    } else {
      fileFilter =
          new AntPatternFileFilter(true, false, this.dataFileIncludes, this.dataFileExcludes);
    }

    // Determine the parent directory to log to
    final File logDirectory = determineLogDirectory(options, "import");

    // Setup reporting file
    final File importReport = new File(logDirectory, "data-import.txt");
    final PrintWriter reportWriter;
    try {
      reportWriter =
          new PrintWriter(new PeriodicFlushingBufferedWriter(500, new FileWriter(importReport)));
    } catch (IOException e) {
      throw new RuntimeException("Failed to create FileWriter for: " + importReport, e);
    }

    // Convert directory to URI String to provide better logging output
    final URI directoryUri = directory.toURI();
    final String directoryUriStr = directoryUri.toString();
    IMPORT_BASE_DIR.set(directoryUriStr);
    try {
      // Scan the specified directory for files to import
      logger.info("Scanning for files to Import from: {}", directory);
      final PortalDataKeyFileProcessor fileProcessor =
          new PortalDataKeyFileProcessor(this.dataKeyTypes, options);
      this.directoryScanner.scanDirectoryNoResults(directory, fileFilter, fileProcessor);
      final long resourceCount = fileProcessor.getResourceCount();
      logger.info("Found {} files to Import from: {}", resourceCount, directory);

      // See if the import should fail on error
      final boolean failOnError = options != null ? options.isFailOnError() : true;

      // Map of files to import, grouped by type
      final ConcurrentMap<PortalDataKey, Queue<Resource>> dataToImport =
          fileProcessor.getDataToImport();

      // Import the data files
      for (final PortalDataKey portalDataKey : this.dataKeyImportOrder) {
        final Queue<Resource> files = dataToImport.remove(portalDataKey);
        if (files == null) {
          continue;
        }

        final Queue<ImportFuture<?>> importFutures = new LinkedList<ImportFuture<?>>();
        final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

        final int fileCount = files.size();
        logger.info("Importing {} files of type {}", fileCount, portalDataKey);
        reportWriter.println(portalDataKey + "," + fileCount);

        while (!files.isEmpty()) {
          final Resource file = files.poll();

          // Check for completed futures on every iteration, needed to fail as fast as possible on
          // an import exception
          final List<FutureHolder<?>> newFailed =
              waitForFutures(importFutures, reportWriter, logDirectory, false);
          failedFutures.addAll(newFailed);

          final AtomicLong importTime = new AtomicLong(-1);

          // Create import task
          final Callable<Object> task =
              new CallableWithoutResult() {
                @Override
                protected void callWithoutResult() {
                  IMPORT_BASE_DIR.set(directoryUriStr);
                  importTime.set(System.nanoTime());
                  try {
                    importData(file, portalDataKey);
                  } finally {
                    importTime.set(System.nanoTime() - importTime.get());
                    IMPORT_BASE_DIR.remove();
                  }
                }
              };

          // Submit the import task
          final Future<?> importFuture = this.importExportThreadPool.submit(task);

          // Add the future for tracking
          importFutures.offer(new ImportFuture(importFuture, file, portalDataKey, importTime));
        }

        // Wait for all of the imports of this type to complete
        final List<FutureHolder<?>> newFailed =
            waitForFutures(importFutures, reportWriter, logDirectory, true);
        failedFutures.addAll(newFailed);

        if (failOnError && !failedFutures.isEmpty()) {
          throw new RuntimeException(
              failedFutures.size()
                  + " "
                  + portalDataKey
                  + " entities failed to import.\n\n"
                  + "\tPer entity exception logs and a full report can be found in "
                  + logDirectory
                  + "\n");
        }

        reportWriter.flush();
      }

      if (!dataToImport.isEmpty()) {
        throw new IllegalStateException(
            "The following PortalDataKeys are not listed in the dataTypeImportOrder List: "
                + dataToImport.keySet());
      }

      logger.info("For a detailed report on the data import see " + importReport);
    } catch (InterruptedException e) {
      throw new RuntimeException("Interrupted while waiting for entities to import", e);
    } finally {
      IOUtils.closeQuietly(reportWriter);
      IMPORT_BASE_DIR.remove();
    }
  }

  /** Determine the directory to write import/export reports to. */
  private File determineLogDirectory(final BatchOptions options, String operation) {
    File logDirectoryParent = options != null ? options.getLogDirectoryParent() : null;
    if (logDirectoryParent == null) {
      logDirectoryParent = Files.createTempDir();
    }
    File logDirectory = new File(logDirectoryParent, "data-" + operation + "-reports");
    try {
      logDirectory = logDirectory.getCanonicalFile();
      FileUtils.deleteDirectory(logDirectory);
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to clean data-" + operation + " log directory: " + logDirectory, e);
    }
    logDirectory.mkdirs();
    return logDirectory;
  }

  @Override
  public void importData(final Resource resource) {
    this.importData(resource, null);
  }

  @Override
  public void importData(Source source) {
    this.importData(source, null);
  }

  @Override
  public final void importData(final Source source, PortalDataKey portalDataKey) {
    // Get a StAX reader for the source to determine info about the data to import
    final BufferedXMLEventReader bufferedXmlEventReader = createSourceXmlEventReader(source);

    // If no PortalDataKey was passed build it from the source
    if (portalDataKey == null) {
      final StartElement rootElement = StaxUtils.getRootElement(bufferedXmlEventReader);
      portalDataKey = new PortalDataKey(rootElement);
      bufferedXmlEventReader.reset();
    }

    final String systemId = source.getSystemId();

    // Post Process the PortalDataKey to see if more complex import operations are needed
    final IPortalDataType portalDataType = this.dataKeyTypes.get(portalDataKey);
    if (portalDataType == null) {
      throw new RuntimeException(
          "No IPortalDataType configured for "
              + portalDataKey
              + ", the resource will be ignored: "
              + getPartialSystemId(systemId));
    }
    final Set<PortalDataKey> postProcessedPortalDataKeys =
        portalDataType.postProcessPortalDataKey(systemId, portalDataKey, bufferedXmlEventReader);
    bufferedXmlEventReader.reset();

    // If post processing produced a single key, import directly
    if (postProcessedPortalDataKeys.size() == 1) {
      this.importOrUpgradeData(
          systemId,
          DataAccessUtils.singleResult(postProcessedPortalDataKeys),
          bufferedXmlEventReader);
    }
    // If post processing produced multiple keys, they must be imported in the configured order
    else {
      // Iterate over the data key order list to run the imports in the correct order
      for (final PortalDataKey orderedPortalDataKey : this.dataKeyImportOrder) {
        if (postProcessedPortalDataKeys.contains(orderedPortalDataKey)) {
          // Reset to the start of the XML document for each import/upgrade call
          bufferedXmlEventReader.reset();
          this.importOrUpgradeData(systemId, orderedPortalDataKey, bufferedXmlEventReader);
        }
      }
    }
  }

  /**
   * @param portalDataKey Optional PortalDataKey to use, useful for batch imports where
   *     post-processing of keys has already taken place
   */
  protected final void importData(final Resource resource, final PortalDataKey portalDataKey) {
    final InputStream resourceStream;
    try {
      resourceStream = resource.getInputStream();
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + resource, e);
    }

    try {
      final String resourceUri = ResourceUtils.getResourceUri(resource);
      this.importData(new StreamSource(resourceStream, resourceUri), portalDataKey);
    } finally {
      IOUtils.closeQuietly(resourceStream);
    }
  }

  protected String getPartialSystemId(String systemId) {
    final String directoryUriStr = IMPORT_BASE_DIR.get();
    if (directoryUriStr == null) {
      return systemId;
    }

    if (systemId.startsWith(directoryUriStr)) {
      return systemId.substring(directoryUriStr.length());
    }

    return systemId;
  }

  /** Run the import/update process on the data */
  protected final void importOrUpgradeData(
      String systemId, PortalDataKey portalDataKey, XMLEventReader xmlEventReader) {
    // See if there is a registered importer for the data, if so import
    final IDataImporter<Object> dataImporterExporter = this.portalDataImporters.get(portalDataKey);
    if (dataImporterExporter != null) {
      this.logger.debug("Importing: {}", getPartialSystemId(systemId));
      final Object data = unmarshallData(xmlEventReader, dataImporterExporter);
      dataImporterExporter.importData(data);
      this.logger.info("Imported : {}", getPartialSystemId(systemId));
      return;
    }

    // No importer, see if there is an upgrader, if so upgrade
    final IDataUpgrader dataUpgrader = this.portalDataUpgraders.get(portalDataKey);
    if (dataUpgrader != null) {
      this.logger.debug("Upgrading: {}", getPartialSystemId(systemId));

      // Convert the StAX stream to a DOM node, due to poor JDK support for StAX with XSLT
      final Node sourceNode;
      try {
        sourceNode = xmlUtilities.convertToDom(xmlEventReader);
      } catch (XMLStreamException e) {
        throw new RuntimeException("Failed to create StAXSource from original XML reader", e);
      }
      final DOMSource source = new DOMSource(sourceNode);

      final DOMResult result = new DOMResult();
      final boolean doImport = dataUpgrader.upgradeData(source, result);
      if (doImport) {
        // The upgrader only transformed the data without importing it; wrap the result DOM
        // in a new Source and run the import process again with the upgraded key
        final org.w3c.dom.Node node = result.getNode();
        final PortalDataKey upgradedPortalDataKey = new PortalDataKey(node);
        if (this.logger.isTraceEnabled()) {
          this.logger.trace(
              "Upgraded: "
                  + getPartialSystemId(systemId)
                  + " to "
                  + upgradedPortalDataKey
                  + "\n\nSource XML: \n"
                  + XmlUtilitiesImpl.toString(source.getNode())
                  + "\n\nResult XML: \n"
                  + XmlUtilitiesImpl.toString(node));
        } else {
          this.logger.info(
              "Upgraded: {} to {}", getPartialSystemId(systemId), upgradedPortalDataKey);
        }
        final DOMSource upgradedSource = new DOMSource(node, systemId);
        this.importData(upgradedSource, upgradedPortalDataKey);
      } else {
        this.logger.info("Upgraded and Imported: {}", getPartialSystemId(systemId));
      }
      return;
    }

    // No importer or upgrader found, fail
    throw new IllegalArgumentException(
        "Provided data "
            + portalDataKey
            + " has no registered importer or upgrader support: "
            + systemId);
  }

  protected Object unmarshallData(
      final XMLEventReader bufferedXmlEventReader,
      final IDataImporter<Object> dataImporterExporter) {
    final Unmarshaller unmarshaller = dataImporterExporter.getUnmarshaller();

    try {
      final StAXSource source = new StAXSource(bufferedXmlEventReader);
      return unmarshaller.unmarshal(source);
    } catch (XmlMappingException e) {
      throw new RuntimeException("Failed to map provided XML to portal data", e);
    } catch (IOException e) {
      throw new RuntimeException("Failed to read the provided XML data", e);
    } catch (XMLStreamException e) {
      throw new RuntimeException("Failed to create StAX Source to read XML data", e);
    }
  }

  protected BufferedXMLEventReader createSourceXmlEventReader(final Source source) {
    // If it is a StAXSource see if we can do better handling of it
    if (source instanceof StAXSource) {
      final StAXSource staxSource = (StAXSource) source;
      XMLEventReader xmlEventReader = staxSource.getXMLEventReader();
      if (xmlEventReader != null) {
        if (xmlEventReader instanceof BufferedXMLEventReader) {
          final BufferedXMLEventReader bufferedXMLEventReader =
              (BufferedXMLEventReader) xmlEventReader;
          bufferedXMLEventReader.reset();
          bufferedXMLEventReader.mark(-1);
          return bufferedXMLEventReader;
        }

        return new BufferedXMLEventReader(xmlEventReader, -1);
      }
    }

    final XMLInputFactory xmlInputFactory = this.xmlUtilities.getXmlInputFactory();
    final XMLEventReader xmlEventReader;
    try {
      xmlEventReader = xmlInputFactory.createXMLEventReader(source);
    } catch (XMLStreamException e) {
      throw new RuntimeException("Failed to create XML Event Reader for data Source", e);
    }
    return new BufferedXMLEventReader(xmlEventReader, -1);
  }
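
  /*
   * Editorial sketch, not part of the original class: how the buffered reader returned
   * above is typically used: peek at the root element to build a PortalDataKey, then reset
   * so the full document can still be read. Assumes BufferedXMLEventReader.mark(-1)
   * buffers without limit, as its use in createSourceXmlEventReader suggests.
   */
  private static PortalDataKey peekPortalDataKeySketch(final BufferedXMLEventReader reader) {
    final StartElement rootElement = StaxUtils.getRootElement(reader);
    final PortalDataKey portalDataKey = new PortalDataKey(rootElement);
    // Rewind to the buffered start so downstream consumers see the whole document
    reader.reset();
    return portalDataKey;
  }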

  @Override
  public Iterable<IPortalDataType> getExportPortalDataTypes() {
    return this.exportPortalDataTypes;
  }

  @Override
  public Iterable<IPortalDataType> getDeletePortalDataTypes() {
    return this.deletePortalDataTypes;
  }

  @Override
  public Iterable<? extends IPortalData> getPortalData(String typeId) {
    final IDataExporter<Object> dataImporterExporter = getPortalDataExporter(typeId);
    return dataImporterExporter.getPortalData();
  }

  @Override
  public String exportData(String typeId, String dataId, Result result) {
    final IDataExporter<Object> portalDataExporter = this.getPortalDataExporter(typeId);
    final Object data = portalDataExporter.exportData(dataId);
    if (data == null) {
      return null;
    }

    final Marshaller marshaller = portalDataExporter.getMarshaller();
    try {
      marshaller.marshal(data, result);
      return portalDataExporter.getFileName(data);
    } catch (XmlMappingException e) {
      throw new RuntimeException("Failed to map provided portal data to XML", e);
    } catch (IOException e) {
      throw new RuntimeException("Failed to write the provided XML data", e);
    }
  }

  @Override
  public boolean exportData(String typeId, String dataId, File directory) {
    directory.mkdirs();

    final File exportTempFile;
    try {
      exportTempFile =
          File.createTempFile(
              SafeFilenameUtils.makeSafeFilename(StringUtils.rightPad(dataId, 2, '-') + "-"),
              SafeFilenameUtils.makeSafeFilename("." + typeId),
              directory);
    } catch (IOException e) {
      throw new RuntimeException(
          "Could not create temp file to export " + typeId + " " + dataId, e);
    }

    try {
      final String fileName = this.exportData(typeId, dataId, new StreamResult(exportTempFile));
      if (fileName == null) {
        logger.info("Skipped: type={} id={}", typeId, dataId);
        return false;
      }

      final File destFile = new File(directory, fileName + "." + typeId + ".xml");
      if (destFile.exists()) {
        logger.warn(
            "Exporting "
                + typeId
                + " "
                + dataId
                + " but destination file already exists, it will be overwritten: "
                + destFile);
        destFile.delete();
      }
      FileUtils.moveFile(exportTempFile, destFile);
      logger.info("Exported: {}", destFile);

      return true;
    } catch (Exception e) {
      if (e instanceof RuntimeException) {
        throw (RuntimeException) e;
      }

      throw new RuntimeException("Failed to export " + typeId + " " + dataId, e);
    } finally {
      FileUtils.deleteQuietly(exportTempFile);
    }
  }
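
  /*
   * Editorial sketch, not part of the original class: the temp-file-then-move pattern used
   * by exportData above, reduced to its essentials. Writing to a temp file in the
   * destination directory and then renaming it means a failed export never leaves a
   * partial file behind; a same-directory move is cheap and effectively atomic on most
   * filesystems. The helper name and byte[] payload are hypothetical.
   */
  private static void writeViaTempFileSketch(File directory, String fileName, byte[] content)
      throws IOException {
    final File tempFile = File.createTempFile(fileName + "-", ".tmp", directory);
    try {
      java.nio.file.Files.write(tempFile.toPath(), content);
      java.nio.file.Files.move(
          tempFile.toPath(),
          new File(directory, fileName).toPath(),
          java.nio.file.StandardCopyOption.REPLACE_EXISTING);
    } finally {
      tempFile.delete(); // no-op once the move has succeeded
    }
  }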

  @Override
  public void exportAllDataOfType(Set<String> typeIds, File directory, BatchExportOptions options) {
    final Queue<ExportFuture<?>> exportFutures = new ConcurrentLinkedQueue<ExportFuture<?>>();
    final boolean failOnError = options != null ? options.isFailOnError() : true;

    // Determine the parent directory to log to
    final File logDirectory = determineLogDirectory(options, "export");

    // Setup reporting file
    final File exportReport = new File(logDirectory, "data-export.txt");
    final PrintWriter reportWriter;
    try {
      reportWriter = new PrintWriter(new BufferedWriter(new FileWriter(exportReport)));
    } catch (IOException e) {
      throw new RuntimeException("Failed to create FileWriter for: " + exportReport, e);
    }

    try {
      for (final String typeId : typeIds) {
        final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

        final File typeDir = new File(directory, typeId);
        logger.info("Adding all data of type {} to export queue: {}", typeId, typeDir);

        reportWriter.println(typeId + "," + typeDir);

        final Iterable<? extends IPortalData> dataForType = this.getPortalData(typeId);
        for (final IPortalData data : dataForType) {
          final String dataId = data.getDataId();

          // Check for completed futures on every iteration, needed to fail as fast as possible on
          // an export exception
          final List<FutureHolder<?>> newFailed =
              waitForFutures(exportFutures, reportWriter, logDirectory, false);
          failedFutures.addAll(newFailed);

          final AtomicLong exportTime = new AtomicLong(-1);

          // Create export task
          Callable<Object> task =
              new CallableWithoutResult() {
                @Override
                protected void callWithoutResult() {
                  exportTime.set(System.nanoTime());
                  try {
                    exportData(typeId, dataId, typeDir);
                  } finally {
                    exportTime.set(System.nanoTime() - exportTime.get());
                  }
                }
              };

          // Submit the export task
          final Future<?> exportFuture = this.importExportThreadPool.submit(task);

          // Add the future for tracking
          final ExportFuture futureHolder =
              new ExportFuture(exportFuture, typeId, dataId, exportTime);
          exportFutures.offer(futureHolder);
        }

        final List<FutureHolder<?>> newFailed =
            waitForFutures(exportFutures, reportWriter, logDirectory, true);
        failedFutures.addAll(newFailed);

        reportWriter.flush();

        if (failOnError && !failedFutures.isEmpty()) {
          throw new RuntimeException(
              failedFutures.size()
                  + " "
                  + typeId
                  + " entities failed to export.\n"
                  + "\tPer entity exception logs and a full report can be found in "
                  + logDirectory);
        }
      }
    } catch (InterruptedException e) {
      throw new RuntimeException("Interrupted while waiting for entities to export", e);
    } finally {
      IOUtils.closeQuietly(reportWriter);
    }
  }

  @Override
  public void exportAllData(File directory, BatchExportOptions options) {
    final Set<IPortalDataType> portalDataTypes;
    if (this.exportAllPortalDataTypes != null) {
      portalDataTypes = this.exportAllPortalDataTypes;
    } else {
      portalDataTypes = this.exportPortalDataTypes;
    }

    final Set<String> typeIds = new LinkedHashSet<String>();
    for (final IPortalDataType portalDataType : portalDataTypes) {
      typeIds.add(portalDataType.getTypeId());
    }
    this.exportAllDataOfType(typeIds, directory, options);
  }

  protected IDataExporter<Object> getPortalDataExporter(String typeId) {
    final IDataExporter<Object> dataExporter = this.portalDataExporters.get(typeId);
    if (dataExporter == null) {
      throw new IllegalArgumentException("No IDataExporter exists for: " + typeId);
    }
    return dataExporter;
  }

  @Override
  public void deleteData(String typeId, String dataId) {
    final IDataDeleter<Object> dataDeleter = this.portalDataDeleters.get(typeId);
    if (dataDeleter == null) {
      throw new IllegalArgumentException("No IDataDeleter exists for: " + typeId);
    }

    final Object data = dataDeleter.deleteData(dataId);
    if (data != null) {
      logger.info("Deleted data " + dataId + " of type " + typeId);
    } else {
      logger.info("No data " + dataId + " of type " + typeId + " exists to delete");
    }
  }

  /**
   * Used by batch import and export to wait for queued tasks to complete. Supports fail-fast
   * behavior: futures that threw an exception or timed out are collected and reported so the
   * caller can abort early. All completed futures are removed from the queue.
   *
   * @param futures Queued futures to check for completeness
   * @param reportWriter Writer that receives a status line for each completed future
   * @param reportDirectory Directory that per-entity failure logs are written to
   * @param wait If true, wait for all futures to complete; if false, only check for already
   *     completed futures
   * @return a list of futures that either threw exceptions or timed out
   */
  protected List<FutureHolder<?>> waitForFutures(
      final Queue<? extends FutureHolder<?>> futures,
      final PrintWriter reportWriter,
      final File reportDirectory,
      final boolean wait)
      throws InterruptedException {

    final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

    for (Iterator<? extends FutureHolder<?>> futuresItr = futures.iterator();
        futuresItr.hasNext(); ) {
      final FutureHolder<?> futureHolder = futuresItr.next();

      // If waiting, or if the future is already done, do the get
      final Future<?> future = futureHolder.getFuture();
      if (wait || future.isDone()) {
        futuresItr.remove();

        try {
          // Don't bother doing a get() on canceled futures
          if (!future.isCancelled()) {
            if (this.maxWait > 0) {
              future.get(this.maxWait, this.maxWaitTimeUnit);
            } else {
              future.get();
            }

            reportWriter.printf(
                REPORT_FORMAT,
                "SUCCESS",
                futureHolder.getDescription(),
                futureHolder.getExecutionTimeMillis());
          }
        } catch (CancellationException e) {
          // Ignore cancellation exceptions
        } catch (ExecutionException e) {
          logger.error("Failed: " + futureHolder);

          futureHolder.setError(e);
          failedFutures.add(futureHolder);
          reportWriter.printf(
              REPORT_FORMAT,
              "FAIL",
              futureHolder.getDescription(),
              futureHolder.getExecutionTimeMillis());

          try {
            final String dataReportName =
                SafeFilenameUtils.makeSafeFilename(
                    futureHolder.getDataType() + "_" + futureHolder.getDataName() + ".txt");
            final File dataReportFile = new File(reportDirectory, dataReportName);
            final PrintWriter dataReportWriter =
                new PrintWriter(new BufferedWriter(new FileWriter(dataReportFile)));
            try {
              dataReportWriter.println(
                  "FAIL: " + futureHolder.getDataType() + " - " + futureHolder.getDataName());
              dataReportWriter.println(
                  "--------------------------------------------------------------------------------");
              e.getCause().printStackTrace(dataReportWriter);
            } finally {
              IOUtils.closeQuietly(dataReportWriter);
            }
          } catch (Exception re) {
            logger.warn(
                "Failed to write error report for failed "
                    + futureHolder
                    + ", logging root failure here",
                e.getCause());
          }
        } catch (TimeoutException e) {
          logger.warn("Failed: " + futureHolder);

          futureHolder.setError(e);
          failedFutures.add(futureHolder);
          future.cancel(true);
          reportWriter.printf(
              REPORT_FORMAT,
              "TIMEOUT",
              futureHolder.getDescription(),
              futureHolder.getExecutionTimeMillis());
        }
      }
    }

    return failedFutures;
  }
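
  /*
   * Editorial sketch, not part of the original class: the drain pattern used by
   * waitForFutures, stripped of reporting. With wait == false only futures that are
   * already done are get()-ed, so the producer loop can surface failures early without
   * blocking on work still in flight.
   */
  private static int drainCompletedSketch(Queue<Future<?>> futures, boolean wait)
      throws InterruptedException, ExecutionException {
    int drained = 0;
    for (Iterator<Future<?>> it = futures.iterator(); it.hasNext(); ) {
      final Future<?> future = it.next();
      if (wait || future.isDone()) {
        it.remove();
        future.get(); // throws ExecutionException if the task failed
        drained++;
      }
    }
    return drained;
  }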

  private abstract static class FutureHolder<T> {
    private final Future<T> future;
    private final AtomicLong time;
    private Exception error;

    public FutureHolder(Future<T> future, AtomicLong time) {
      this.future = future;
      this.time = time;
    }

    public Future<T> getFuture() {
      return this.future;
    }

    public double getExecutionTimeMillis() {
      final long t = time.get();
      if (!future.isDone()) {
        // Task still running; report the elapsed time so far, converted to milliseconds
        return (System.nanoTime() - t) / 1000000.0;
      }
      return t / 1000000.0;
    }

    public Exception getError() {
      return error;
    }

    public void setError(Exception error) {
      this.error = error;
    }

    public abstract String getDescription();

    public abstract String getDataType();

    public abstract String getDataName();
  }

  private static class ImportFuture<T> extends FutureHolder<T> {
    private final Resource resource;
    private final PortalDataKey dataKey;

    public ImportFuture(
        Future<T> future, Resource resource, PortalDataKey dataKey, AtomicLong importTime) {
      super(future, importTime);
      this.resource = resource;
      this.dataKey = dataKey;
    }

    @Override
    public String getDescription() {
      return this.resource.getDescription();
    }

    @Override
    public String getDataType() {
      return dataKey.getName().getLocalPart();
    }

    @Override
    public String getDataName() {
      return this.resource.getFilename();
    }

    @Override
    public String toString() {
      return "importing " + this.getDescription();
    }
  }

  private static class ExportFuture<T> extends FutureHolder<T> {
    private final String typeId;
    private final String dataId;

    public ExportFuture(Future<T> future, String typeId, String dataId, AtomicLong exportTime) {
      super(future, exportTime);
      this.typeId = typeId;
      this.dataId = dataId;
    }

    @Override
    public String getDescription() {
      return "type=" + this.typeId + ", dataId=" + this.dataId;
    }

    @Override
    public String getDataType() {
      return this.typeId;
    }

    @Override
    public String getDataName() {
      return this.dataId;
    }

    @Override
    public String toString() {
      return "exporting " + this.getDescription();
    }
  }
}
Example #30
0
  public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + content.getUrl(), e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // We have converted the SAX events generated by Tika into a DOM object,
    // so we can now use the usual Nutch HTML utilities.
    // Get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] tikaMDNames = tikamd.names();
    for (String tikaMDName : tikaMDNames) {
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // No outlinks? Try OutlinkExtractor; it works, for example, for mime types that have
    // no explicit anchor markup.

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
          });
    }
    ParseData parseData =
        new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
        ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
  }
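
  /*
   * Editorial sketch, not part of the original class: the meta-refresh branch of getParse
   * in isolation, grounded only in calls already used above. A refresh directive is
   * reported as a soft redirect (SUCCESS_REDIRECT) whose args carry the target URL and the
   * refresh delay, so the fetcher can follow it.
   */
  private static ParseStatus refreshAwareStatusSketch(HTMLMetaTags metaTags) {
    final ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
          });
    }
    return status;
  }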