Ejemplo n.º 1
   * Get the Mime type of an Asset based on its type. If the Asset already has the "content-type"
   * property set, we return that. Otherwise the Apache Tika library is used to do file type
   * detection.
   * @return A string representation of the content type suitable for use in an HTTP header. Eg.
   *     "image/jpeg" for a jpeg image.
  public <T> String getMimeType(Entity entity, T type) {

    Map<String, Object> fileMetadata = AssetUtils.getFileMetadata(entity);
    if (fileMetadata.get(AssetUtils.CONTENT_TYPE) != null) {
      return (String) fileMetadata.get(AssetUtils.CONTENT_TYPE);

    Metadata metadata = new Metadata();
    MediaType mediaType = MediaType.OCTET_STREAM;
    try {
      if (type instanceof byte[]) {

        ByteArrayInputStream bais = new ByteArrayInputStream((byte[]) type);
        mediaType = detector.detect(bais, metadata);
      } else if (type instanceof File) {

        InputStream fis = new BufferedInputStream(new FileInputStream((File) type));
        try {
          mediaType = detector.detect(fis, metadata);
        } finally {
      } else {
        return mediaType.toString();

      fileMetadata.put(AssetUtils.CONTENT_TYPE, mediaType.toString());
    } catch (IOException e) {
      LOG.error("error detecting mime type", e);

    return mediaType.toString();
  public String getContentType(String fileName) {
    if (Validator.isNull(fileName)) {
      return ContentTypes.APPLICATION_OCTET_STREAM;

    try {
      Metadata metadata = new Metadata();

      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      MediaType mediaType = _detector.detect(null, metadata);

      String contentType = mediaType.toString();

      if (!contentType.contains("tika")) {
        return contentType;
      } else if (_log.isDebugEnabled()) {
        _log.debug("Retrieved invalid content type " + contentType);
    } catch (Exception e) {
      _log.error(e, e);

    return ContentTypes.APPLICATION_OCTET_STREAM;
Ejemplo n.º 3
   * We don't currently support the .xlsb file format (an OOXML container with binary blobs), but we
   * shouldn't break on these files either (TIKA-826)
  public void testExcelXLSB() throws Exception {
    Detector detector = new DefaultDetector();
    AutoDetectParser parser = new AutoDetectParser();

    Metadata m = new Metadata();
    m.add(Metadata.RESOURCE_NAME_KEY, "excel.xlsb");

    // Should be detected correctly
    MediaType type;
    try (InputStream input =
        ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
      type = detector.detect(input, m);
      assertEquals("application/vnd.ms-excel.sheet.binary.macroenabled.12", type.toString());

    // OfficeParser won't handle it
    assertEquals(false, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));

    // OOXMLParser won't handle it
    assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));

    // AutoDetectParser doesn't break on it
    try (InputStream input =
        ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL.xlsb")) {
      ContentHandler handler = new BodyContentHandler(-1);
      ParseContext context = new ParseContext();
      context.set(Locale.class, Locale.US);
      parser.parse(input, handler, m, context);

      String content = handler.toString();
      assertEquals("", content);
    public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      // Is it a supported image?
      String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
      String type = metadata.get(Metadata.CONTENT_TYPE);
      boolean accept = false;

      if (type != null) {
        for (MediaType mt : types) {
          if (mt.toString().equals(type)) {
            accept = true;
      if (filename != null) {
        for (MediaType mt : types) {
          String ext = "." + mt.getSubtype();
          if (filename.endsWith(ext)) {
            accept = true;

      if (!accept) return;

      handleImage(stream, filename, type);
Ejemplo n.º 5
 /** Returns true if mediaType falls withing the given range (pattern), false otherwise */
 private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
   String WILDCARD = "*";
   String rangePatternType = rangePattern.getType();
   String rangePatternSubtype = rangePattern.getSubtype();
   return (rangePatternType.equals(WILDCARD) || rangePatternType.equals(mediaType.getType()))
       && (rangePatternSubtype.equals(WILDCARD)
           || rangePatternSubtype.equals(mediaType.getSubtype()));
 // Static initializer: creates the SUPPORTED_MIMETYPES list and begins iterating over the media
 // types that a PackageParser reports from getSupportedTypes(null).
 // NOTE(review): the loop body and the closing braces are not visible in this excerpt, so what
 // gets added to the list cannot be confirmed from here.
 static {
   SUPPORTED_MIMETYPES = new ArrayList<String>();
   Parser p = new PackageParser();
   for (MediaType mt : p.getSupportedTypes(null)) {
     // Tika can probably do some useful text
Ejemplo n.º 7
  public static String getMimeTypeFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
      result = mt.getType() + "/" + mt.getSubtype();

    return result;
Ejemplo n.º 8
  public static String getCharsetFromContentType(String contentType) {
    String result = "";
    MediaType mt = MediaType.parse(contentType);
    if (mt != null) {
      String charset = mt.getParameters().get("charset");
      if (charset != null) {
        result = charset;

    return result;
Ejemplo n.º 9
  // Parses an HTML stream. Visible steps: detect the charset with an AutoDetectReader, rewrite the
  // Content-Type metadata to carry that charset for the HTML-family types listed below, record the
  // charset name as Content-Encoding, then configure a TagSoup parser with the schema and mapper
  // taken from the parse context (or the defaults).
  // NOTE(review): this excerpt has lost lines - the closing braces, the call that opens the
  // "new XHTMLDowngradeHandler(" argument, and the body of the finally block are not visible.
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    AutoDetectReader reader =
        new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
    try {
      Charset charset = reader.getCharset();
      String previous = metadata.get(Metadata.CONTENT_TYPE);
      MediaType contentType = null;
      // Only the types below get their Content-Type rewritten; a missing type counts as text/html
      if (previous == null || previous.startsWith("text/html")) {
        contentType = new MediaType(MediaType.TEXT_HTML, charset);
      } else if (previous.startsWith("application/xhtml+xml")) {
        contentType = new MediaType(XHTML, charset);
      } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
        contentType = new MediaType(WAP_XHTML, charset);
      } else if (previous.startsWith("application/x-asp")) {
        contentType = new MediaType(X_ASP, charset);
      if (contentType != null) {
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // Get the HTML mapper from the parse context
      HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      // Parse the HTML document
      org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

      // Use schema from context or default
      Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse share schema to avoid heavy instantiation
      parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

          // NOTE(review): the method call this argument belongs to is missing from the excerpt
          new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata)));

    } finally {
Ejemplo n.º 10
 // NOTE(review): this class is shown as extracted, with lines lost - comment delimiters, closing
 // braces, the call wrapping the SUPPORTED_TYPES set, the body of getSupportedTypes and the call
 // that receives the stream/handler arguments in parse are not visible. Restore them from the
 // original source before compiling.
 * A more "natural" implementation of an XML parser. Instead of "generating" HTML-like wrapper
 * events and then producing the PCDATA (only - this is the reason why the default XMLParser should
 * be called "embedded"), this parser produces the actual XML start and end document and tag events
 * (that get wrapped by Tika's own {@link org.apache.tika.parser.xml.XMLParser XMLParser}).
 * Furthermore, this parser semi-structures the element's PCDATA text by separating content from
 * different elements by linebreaks, indenting PCDATA content according to the current element's
 * depth, and drops any ("ignorable") character stretches consisting only of spaces.
 * @author Florian Leitner
public class UnembeddedXMLParser extends AbstractParser {
  /** Serial version UID */
  private static final long serialVersionUID = -6028860725229212437L;
  /** Only support XML */
  private static final Set<MediaType> SUPPORTED_TYPES =
          // NOTE(review): one more ')' closes here than opens on this line; the outer call is missing
          new HashSet<MediaType>(Arrays.asList(MediaType.application("xml"))));

  /** {@inheritDoc} */
  public Set<MediaType> getSupportedTypes(ParseContext context) {

   * Parse the input stream with a SAX parser. Wraps the content handler with an {@link
   * org.apache.tika.sax.OfflineContentHandler} to avoid that any namespace lookups are made. In
   * addition, by overriding {@link #getContentHandler(ContentHandler, Metadata, ParseContext)}, it
   * is possible to add additional wrappers.
   * @param stream that should be parsed
   * @param handler that will receive the SAX events
   * @param metadata of current document stream
   * @param context of current parse
   * @throws IOException if the stream cannot be read
   * @throws SAXException if the SAX parsing fails.
   * @throws TikaException if the XML parsing fails.
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    final TaggedContentHandler tagged = new TaggedContentHandler(handler);
    // Default the content type to application/xml when the caller has not supplied one
    if (metadata.get(HttpHeaders.CONTENT_TYPE) == null) {
      metadata.set(HttpHeaders.CONTENT_TYPE, "application/xml");
    try {
              // The stream is shielded from close(); the handler chain is the overridable
              // getContentHandler(...) result wrapped in an OfflineContentHandler
              new CloseShieldInputStream(stream),
              new OfflineContentHandler(getContentHandler(tagged, metadata, context)));
    } catch (final SAXException e) {
      // SAX failures are rethrown as TikaException with the original exception as cause
      throw new TikaException("XML parse error", e);

   * Return the handler (ie., does nothing). This method can be overridden to add wrap the content
   * handler with additional handlers.
   * @param handler to wrap
   * @param metadata of current document
   * @param context of current parse
   * @return
  protected ContentHandler getContentHandler(
      ContentHandler handler, Metadata metadata, ParseContext context) {
    return handler;
Ejemplo n.º 11
  * Detects the content type of the given input event. Returns <code>application/octet-stream
  * </code> if the type of the event can not be detected.
  * <p>It is legal for the event headers or body to be empty. The detector may read bytes from
  * the start of the body stream to help in type detection.
  * @return detected media type, or <code>application/octet-stream</code>
 private String getMediaType(InputStream in, Metadata metadata, boolean excludeParameters) {
   MediaType mediaType;
   try {
     mediaType = getDetector().detect(in, metadata);
   } catch (IOException e) {
     throw new MorphlineRuntimeException(e);
   String mediaTypeStr = mediaType.toString();
   if (excludeParameters) {
     int i = mediaTypeStr.indexOf(';');
     if (i >= 0) {
       mediaTypeStr = mediaTypeStr.substring(0, i);
   return mediaTypeStr;
Ejemplo n.º 12
/** Tika Parser for Microsoft Project MPX files (Text based) */
public class MPXParser extends AbstractParser {
  private static final long serialVersionUID = -4791025107910605527L;

  private static List<MediaType> TYPES =
      Arrays.asList(new MediaType[] {MediaType.application("x-project")});

  public Set<MediaType> getSupportedTypes(ParseContext context) {
    return new HashSet<MediaType>(TYPES);

  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, TikaException, SAXException {
    MPXReader reader = new MPXReader();
    ProjectFile project = null;

    try {
      project = reader.read(stream);
    } catch (MPXJException e) {
      throw new TikaException("Error reading MPX file", e);

    // Extract helpful information out
    ProjectFileProcessor.parse(project, handler, metadata, context);
Ejemplo n.º 13
  // Imports a data archive from a raw stream: ensures the stream is buffered, marks it, detects
  // its media type from the stream and the archive's file name, wraps it in the matching
  // archive or compressor stream, and delegates to another importDataArchive overload.
  // Unrecognized types raise a RuntimeException; IOExceptions are rethrown as RuntimeException
  // with the cause attached.
  // NOTE(review): closing braces and the body of the finally block are missing from this excerpt.
  protected void importDataArchive(
      Resource archive, InputStream resourceStream, BatchImportOptions options) {
    BufferedInputStream bufferedResourceStream = null;
    try {
      // Make sure the stream is buffered
      if (resourceStream instanceof BufferedInputStream) {
        bufferedResourceStream = (BufferedInputStream) resourceStream;
      } else {
        bufferedResourceStream = new BufferedInputStream(resourceStream);

      // Buffer up to 100MB, bad things will happen if we bust this buffer.
      // TODO see if there is a buffered stream that will write to a file once the buffer fills up
      bufferedResourceStream.mark(100 * 1024 * 1024);
      final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename());

      // Archive formats are wrapped in an ArchiveInputStream ...
      if (MT_JAVA_ARCHIVE.equals(type)) {
        final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MediaType.APPLICATION_ZIP.equals(type)) {
        final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_CPIO.equals(type)) {
        final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_AR.equals(type)) {
        final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_TAR.equals(type)) {
        final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      // ... while pure compression formats are wrapped in a CompressorInputStream
      } else if (MT_BZIP2.equals(type)) {
        final CompressorInputStream compressedStream =
            new BZip2CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_GZIP.equals(type)) {
        final CompressorInputStream compressedStream =
            new GzipCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_PACK200.equals(type)) {
        final CompressorInputStream compressedStream =
            new Pack200CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_XZ.equals(type)) {
        final CompressorInputStream compressedStream =
            new XZCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else {
        throw new RuntimeException("Unrecognized archive media type: " + type);
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    } finally {
    // Stores the rendering context, starts with an empty set of expected media types, and - when
    // the PARAM_IMAGES_SAME_FOLDER parameter is true (default false) - takes the image folder
    // from the parent reference of a rendition location.
    // NOTE(review): the initializer of "location", the closing braces and anything after the
    // debug statement are missing from this excerpt.
    private TikaImageExtractingParser(RenderingContext renderingContext) {
      this.renderingContext = renderingContext;

      // Our expected types
      types = new HashSet<MediaType>();

      // Are images going in the same place as the HTML?
      if (renderingContext.getParamWithDefault(PARAM_IMAGES_SAME_FOLDER, false)) {
        RenditionLocation location =
        imgFolder = location.getParentRef();
        if (logger.isDebugEnabled()) {
          logger.debug("Using imgFolder: " + imgFolder);
  public String getContentType(InputStream inputStream, String fileName) {
    if ((inputStream == null) && Validator.isNull(fileName)) {
      return ContentTypes.APPLICATION_OCTET_STREAM;

    String contentType = null;

    try {
      Metadata metadata = new Metadata();

      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      MediaType mediaType = _detector.detect(TikaInputStream.get(inputStream), metadata);

      contentType = mediaType.toString();

      if (contentType.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + contentType);

        contentType = getContentType(fileName);

      if (contentType.contains("tika")) {
        if (_log.isDebugEnabled()) {
          _log.debug("Retrieved invalid content type " + contentType);

        contentType = ContentTypes.APPLICATION_OCTET_STREAM;
    } catch (Exception e) {
      _log.error(e, e);

      contentType = ContentTypes.APPLICATION_OCTET_STREAM;

    return contentType;
Ejemplo n.º 16
  /** @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap */
  public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
      throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
      if (XML_MEDIA_TYPES.contains(mediaType)) {
        return processXml(url, content);
      } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
        return (AbstractSiteMap) processText(url.toString(), content);
      } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
        return processGzip(url, content);
      } else {
        mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType); // Check
        // parent
        return parseSiteMap(mediaType.toString(), content, url);

    throw new UnknownFormatException(
        "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
Ejemplo n.º 17
  // Identifies the specific format of a zip container by looking for well-known entry names:
  // "mimetype" (its text content becomes the type), "_rels/.rels" or "[Content_Types].xml"
  // (Office Open XML, inspected through an OPCPackage), "buildVersionHistory.plist" (iWork) and
  // "META-INF/" (Java archive). Falls back to APPLICATION_ZIP when none is present.
  // NOTE(review): closing braces, the body of the finally block and the initializer of "core"
  // are missing from this excerpt. No close() of "zip" or "pkg" is visible here either - confirm
  // against the original source whether they are released.
  public MediaType detect(TikaInputStream input, Metadata metadata) throws IOException {
    ZipFile zip = new ZipFile(input.getFile());
    for (ZipEntry entry : Collections.list(zip.entries())) {
      // Is it an Open Document file?
      if (entry.getName().equals("mimetype")) {
        InputStream stream = zip.getInputStream(entry);
        try {
          return fromString(IOUtils.toString(stream, "UTF-8"));
        } finally {
      } else if (entry.getName().equals("_rels/.rels")
          || entry.getName().equals("[Content_Types].xml")) {
        // Office Open XML File
        // As POI to open and investigate it for us
        try {
          OPCPackage pkg = OPCPackage.open(input.getFile().toString());

          PackageRelationshipCollection core =
          if (core.size() != 1) {
            throw new IOException(
                "Invalid OOXML Package received - expected 1 core document, found " + core.size());

          // Get the type of the core document part
          PackagePart corePart = pkg.getPart(core.getRelationship(0));
          String coreType = corePart.getContentType();

          // Turn that into the type of the overall document
          // (everything before the last '.' of the core part's content type)
          String docType = coreType.substring(0, coreType.lastIndexOf('.'));
          return fromString(docType);
        } catch (InvalidFormatException e) {
          throw new IOException("Office Open XML File detected, but corrupted - " + e.getMessage());
      } else if (entry.getName().equals("buildVersionHistory.plist")) {
        // This is an iWork document

        // Reset and ask
        zip = new ZipFile(input.getFile());
        return IWorkPackageParser.identifyType(zip);
      } else if (entry.getName().equals("META-INF/")) {
        // Java Jar
        return MediaType.application("java-archive");

    return MediaType.APPLICATION_ZIP;
Ejemplo n.º 18
   // NOTE(review): only a skeleton of this method survives in the excerpt - the comment
   // delimiters, the statements under the XML and TEXT headings, whatever follows the gzip
   // parse call, and the closing brace are not visible.
   * Performs a one time intialization of Tika's Media-Type components and media type collection
   * constants <br>
   * Please note that this is a private static method which is called once per CLASS (not per
   * instance / object)
  private static void initMediaTypes() {
    /* XML media types (and all aliases) */

    /* TEXT media types (and all aliases) */

    /* GZIP media types (and all aliases) */
    MediaType gzipMediaType = MediaType.parse("application/gzip");
Ejemplo n.º 19
  // Collects metadata for a file: records the Tika-detected file type and the file size, runs an
  // AutoDetectParser over the content, rewrites each metadata entry from a StringBuilder, and
  // finally stores the type reported by a DefaultDetector under "media".
  // NOTE(review): this excerpt has lost lines - the statements that append values to
  // dataBuilder, the else body, the call that owns the error-message argument in the catch
  // block, and the closing braces are missing. No close() of either BufferedInputStream is
  // visible; confirm against the original whether the streams are released.
  private static Metadata tika_parse(File audioFile) {
    Metadata metadata = new Metadata();
    try {
      String filetype = new Tika().detect(audioFile);
      metadata.set("tika.filetype", filetype);
      metadata.set("file.size", Long.toString(audioFile.length()));

      BufferedInputStream inputStream = new BufferedInputStream(new FileInputStream(audioFile));
      new AutoDetectParser().parse(inputStream, new BodyContentHandler(), metadata);

      for (String key : metadata.names()) {
        StringBuilder dataBuilder = new StringBuilder();
        if (metadata.isMultiValued(key)) {
          for (String val : metadata.getValues(key)) {
            if (dataBuilder.length() > 1) {
              dataBuilder.append(", ");
        } else {
        metadata.set(key, dataBuilder.toString().trim());

      // A second stream is opened for the detector, which is given an empty Metadata
      inputStream = new BufferedInputStream(new FileInputStream(audioFile));
      MediaType media = new DefaultDetector().detect(inputStream, new Metadata());
      metadata.set("media", media.toString());
    } catch (SAXException | IOException | TikaException e) {
          "tika_parse error processing file (" + audioFile.getName() + "): " + e.getMessage());
    return metadata;
Ejemplo n.º 20
  // NOTE(review): the comment delimiters around the description below, the call that owns the
  // final two-argument line, and the closing brace are missing from this excerpt.
  If Tesseract is found, test we retrieve the proper number of supporting Parsers.
  public void offersTypesIfFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();

    ParseContext parseContext = new ParseContext();
    MediaType png = MediaType.image("png");

    // Assuming that Tesseract is on the path, we should find 5 Parsers that support PNG.

    // The count asserted here is the number of media types TesseractOCRParser reports
    assertEquals(5, parser.getSupportedTypes(parseContext).size());

    // DefaultParser will now select the TesseractOCRParser.
        TesseractOCRParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
Ejemplo n.º 21
  protected Parser getParser(Metadata metadata, ParseContext context) {
    Map<MediaType, Parser> map = getParsers(context);
    MediaType type = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));
    if (type != null) {
      // We always work on the normalised, canonical form
      type = registry.normalize(type);
    while (type != null) {
      // Try finding a parser for the type
      Parser parser = map.get(type);
      if (parser != null) {
        return parser;

      // Failing that, try for the parent of the type
      type = registry.getSupertype(type);
    return fallback;
   * (non-Javadoc)
   * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
  protected void render(RenderingContext context) {
    ContentReader contentReader = context.makeContentReader();
    String sourceMimeType = contentReader.getMimetype();

    // Check that Tika supports the supplied file
    AutoDetectParser p = new AutoDetectParser(tikaConfig);
    MediaType sourceMediaType = MediaType.parse(sourceMimeType);
    if (!p.getParsers().containsKey(sourceMediaType)) {
      throw new RenditionServiceException(
          "Source mime type of "
              + sourceMimeType
              + " is not supported by Tika for HTML conversions");

    // Make the HTML Version using Tika
    // This will also extract out any images as found
    generateHTML(p, context);
Ejemplo n.º 23
  // NOTE(review): the comment delimiters around the description below and the closing brace are
  // missing from this excerpt. The "invalid path" comment is not followed by any visible call
  // that sets a path on invalidConfig - presumably that line was lost too; confirm against the
  // original test.
  Check that if Tesseract is not found, the TesseractOCRParser claims to not support
  any file types. So, the standard image parser is called instead.
  public void offersNoTypesIfNotFound() throws Exception {
    TesseractOCRParser parser = new TesseractOCRParser();
    DefaultParser defaultParser = new DefaultParser();
    MediaType png = MediaType.image("png");

    // With an invalid path, will offer no types
    TesseractOCRConfig invalidConfig = new TesseractOCRConfig();

    // The config reaches the parsers through the ParseContext
    ParseContext parseContext = new ParseContext();
    parseContext.set(TesseractOCRConfig.class, invalidConfig);

    // No types offered
    assertEquals(0, parser.getSupportedTypes(parseContext).size());

    // And DefaultParser won't use us
    assertEquals(ImageParser.class, defaultParser.getParsers(parseContext).get(png).getClass());
Ejemplo n.º 24
 private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag)
     throws TikaException, IOException {
   Set<MediaType> types = null;
   NodeList children = node.getChildNodes();
   for (int i = 0; i < children.getLength(); i++) {
     Node cNode = children.item(i);
     if (cNode instanceof Element) {
       Element cElement = (Element) cNode;
       if (tag.equals(cElement.getTagName())) {
         String mime = getText(cElement);
         MediaType type = MediaType.parse(mime);
         if (type != null) {
           if (types == null) types = new HashSet<>();
         } else {
           throw new TikaException("Invalid media type name: " + mime);
   if (types != null) return types;
   return Collections.emptySet();
Ejemplo n.º 25
 /**
  * Creates a filter whose media type set holds {@code MediaType.TEXT_HTML},
  * {@code MediaType.TEXT_PLAIN} and {@code MediaType.APPLICATION_XML}.
  */
 public MediaTypeFilter() {
   this.mediaTypes =
       MediaType.set(MediaType.TEXT_HTML, MediaType.TEXT_PLAIN, MediaType.APPLICATION_XML);
 }
Ejemplo n.º 26
 * HTML parser. Uses TagSoup to turn the input document to HTML SAX events, and post-processes the
 * events to produce XHTML and metadata expected by Tika clients.
public class HtmlParser extends AbstractParser {

  /** Serial version UID */
  private static final long serialVersionUID = 7895315240498733128L;

  // Media types, besides text/html, that this parser declares in SUPPORTED_TYPES.
  private static final MediaType XHTML = MediaType.application("xhtml+xml");
  private static final MediaType WAP_XHTML = MediaType.application("vnd.wap.xhtml+xml");
  private static final MediaType X_ASP = MediaType.application("x-asp");

  // NOTE(review): the parentheses on the initializer below are unbalanced (one extra ')'); an
  // enclosing call appears to be missing from this listing.
  private static final Set<MediaType> SUPPORTED_TYPES =
          new HashSet<MediaType>(Arrays.asList(MediaType.text("html"), XHTML, WAP_XHTML, X_ASP)));

  // Default ServiceLoader handed to AutoDetectReader when the ParseContext does not supply one.
  private static final ServiceLoader LOADER = new ServiceLoader(HtmlParser.class.getClassLoader());

  /** HTML schema singleton used to amortise the heavy instantiation time. */
  private static final Schema HTML_SCHEMA = new HTMLSchema();

  // NOTE(review): no method body is visible for getSupportedTypes in this listing; presumably it
  // returns SUPPORTED_TYPES. Confirm against the full source.
  public Set<MediaType> getSupportedTypes(ParseContext context) {

  // Parses an HTML stream: detects the charset, records content type and encoding in the
  // metadata, then configures a TagSoup parser with the mapper and schema taken from the context.
  public void parse(
      InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
      throws IOException, SAXException, TikaException {
    // Automatically detect the character encoding
    // The stream is wrapped in a CloseShieldInputStream, presumably so that closing the reader
    // leaves the caller's stream open.
    AutoDetectReader reader =
        new AutoDetectReader(
            new CloseShieldInputStream(stream), metadata, context.get(ServiceLoader.class, LOADER));
    try {
      Charset charset = reader.getCharset();
      String previous = metadata.get(Metadata.CONTENT_TYPE);
      // Keep the previously declared (X)HTML/ASP base type but attach the detected charset. Any
      // other previous value leaves contentType null, so CONTENT_TYPE is not overwritten.
      MediaType contentType = null;
      if (previous == null || previous.startsWith("text/html")) {
        contentType = new MediaType(MediaType.TEXT_HTML, charset);
      } else if (previous.startsWith("application/xhtml+xml")) {
        contentType = new MediaType(XHTML, charset);
      } else if (previous.startsWith("application/vnd.wap.xhtml+xml")) {
        contentType = new MediaType(WAP_XHTML, charset);
      } else if (previous.startsWith("application/x-asp")) {
        contentType = new MediaType(X_ASP, charset);
      if (contentType != null) {
        metadata.set(Metadata.CONTENT_TYPE, contentType.toString());
      // deprecated, see TIKA-431
      metadata.set(Metadata.CONTENT_ENCODING, charset.name());

      // Get the HTML mapper from the parse context
      // Falls back to HtmlParserMapper, which routes to this class's deprecated mapping methods.
      HtmlMapper mapper = context.get(HtmlMapper.class, new HtmlParserMapper());

      // Parse the HTML document
      org.ccil.cowan.tagsoup.Parser parser = new org.ccil.cowan.tagsoup.Parser();

      // Use schema from context or default
      Schema schema = context.get(Schema.class, HTML_SCHEMA);

      // TIKA-528: Reuse share schema to avoid heavy instantiation
      parser.setProperty(org.ccil.cowan.tagsoup.Parser.schemaProperty, schema);
      // TIKA-599: Shared schema is thread-safe only if bogons are ignored
      parser.setFeature(org.ccil.cowan.tagsoup.Parser.ignoreBogonsFeature, true);

      // NOTE(review): the call that receives this handler chain, and the statement that starts
      // the parse, are not visible in this listing.
          new XHTMLDowngradeHandler(new HtmlHandler(mapper, handler, metadata)));

      // NOTE(review): the body of this finally block is not visible; presumably it closes reader.
    } finally {

   * Maps "safe" HTML element names to semantic XHTML equivalents. If the given element is unknown
   * or deemed unsafe for inclusion in the parse output, then this method returns <code>null</code>
   * and the element will be ignored but the content inside it is still processed. See the {@link
   * #isDiscardElement(String)} method for a way to discard the entire contents of an element.
   * <p>Subclasses can override this method to customize the default mapping.
   * @param name HTML element name (upper case)
   * @return XHTML element name (lower case), or <code>null</code> if the element is unsafe
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
  protected String mapSafeElement(String name) {
    // Delegates to the shared default mapper.
    return DefaultHtmlMapper.INSTANCE.mapSafeElement(name);

   * Checks whether all content within the given HTML element should be discarded instead of
   * including it in the parse output. Subclasses can override this method to customize the set of
   * discarded elements.
   * @param name HTML element name (upper case)
   * @return <code>true</code> if content inside the named element should be ignored, <code>false
   *     </code> otherwise
   * @since Apache Tika 0.5
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
  protected boolean isDiscardElement(String name) {
    // Delegates to the shared default mapper.
    return DefaultHtmlMapper.INSTANCE.isDiscardElement(name);

   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This method
   *     will be removed in Tika 1.0.
  public String mapSafeAttribute(String elementName, String attributeName) {
    // Delegates to the shared default mapper.
    return DefaultHtmlMapper.INSTANCE.mapSafeAttribute(elementName, attributeName);

   * Adapter class that maintains backwards compatibility with the protected HtmlParser methods.
   * Making HtmlParser implement HtmlMapper directly would require those methods to be public, which
   * would break backwards compatibility with subclasses.
   * @deprecated Use the {@link HtmlMapper} mechanism to customize the HTML mapping. This class will
   *     be removed in Tika 1.0.
  private class HtmlParserMapper implements HtmlMapper {
    // Each method forwards to the enclosing HtmlParser instance so subclass overrides are honored.
    public String mapSafeElement(String name) {
      return HtmlParser.this.mapSafeElement(name);

    public boolean isDiscardElement(String name) {
      return HtmlParser.this.isDiscardElement(name);

    public String mapSafeAttribute(String elementName, String attributeName) {
      return HtmlParser.this.mapSafeAttribute(elementName, attributeName);
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    // Parses the content stream with a Tika parser. In extract-only mode the serialized markup and
    // the collected metadata are added to the response; otherwise the parse feeds a
    // SolrContentHandler (optionally filtered by an XPath expression).
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      // An explicit stream type selects the parser by media type; the lookup may return null,
      // which is reported by the final else branch of this method.
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        // Record where the content came from alongside the extracted metadata.
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        ContentHandler parsingHandler = handler;

        // writer and serializer are only assigned in extract-only mode.
        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            // NOTE(review): no statement visible here connects this TextSerializer to writer,
            // unlike the XML branch below; confirm against the full source.
            serializer = new TextSerializer();
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
                                  // http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
        } else if (xpathExpr != null) {
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          // Register the chosen parser and the HTML mapper in the context passed to parse().
          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              // NOTE(review): the statements that hand the opened password file to epp are not
              // visible in this listing.
              log.debug("Password file supplied: " + pwMapFile);
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            // NOTE(review): the statement that applies resourcePassword is not visible here.
            log.debug("Literal password supplied for file " + resourceName);
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          // With ignoreTikaException set the failure is not rethrown; otherwise it becomes a
          // SERVER_ERROR SolrException wrapping the cause.
          if (ignoreTikaException)
                new StringBuilder("skip extracting text due to ")
                    .append(". metadata=")
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        if (extractOnly == false) {
          // NOTE(review): the body of this branch is not visible in this listing.
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
          // Extract-only output: the serialized content under the stream name, and every metadata
          // name with all of its values under "<stream name>_metadata".
          rsp.add(stream.getName(), writer.toString());
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          rsp.add(stream.getName() + "_metadata", metadataNL);
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      // NOTE(review): the body of this finally block is not visible; presumably it closes
      // inputStream.
      } finally {
    } else {
      // No parser was found for the requested stream type.
      throw new SolrException(
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
Ejemplo n.º 28
    public void parseEmbedded(
        InputStream inputStream,
        ContentHandler contentHandler,
        Metadata metadata,
        boolean outputHtml)
        throws SAXException, IOException {
      String name = metadata.get(Metadata.RESOURCE_NAME_KEY);

      if (name == null) {
        name = "file" + count++;

      MediaType contentType = detector.detect(inputStream, metadata);

      if (name.indexOf('.') == -1 && contentType != null) {
        try {
          name += config.getMimeRepository().forName(contentType.toString()).getExtension();
        } catch (MimeTypeException e) {

      String relID = metadata.get(Metadata.EMBEDDED_RELATIONSHIP_ID);
      if (relID != null && !name.startsWith(relID)) {
        name = relID + "_" + name;

      File outputFile = new File(extractDir, name);
      File parent = outputFile.getParentFile();
      if (!parent.exists()) {
        if (!parent.mkdirs()) {
          throw new IOException("unable to create directory \"" + parent + "\"");
      System.out.println("Extracting '" + name + "' (" + contentType + ") to " + outputFile);

      FileOutputStream os = null;

      try {
        os = new FileOutputStream(outputFile);

        if (inputStream instanceof TikaInputStream) {
          TikaInputStream tin = (TikaInputStream) inputStream;

          if (tin.getOpenContainer() != null && tin.getOpenContainer() instanceof DirectoryEntry) {
            POIFSFileSystem fs = new POIFSFileSystem();
            copy((DirectoryEntry) tin.getOpenContainer(), fs.getRoot());
          } else {
            IOUtils.copy(inputStream, os);
        } else {
          IOUtils.copy(inputStream, os);
      } catch (Exception e) {
        logger.warn("Ignoring unexpected exception trying to save embedded file " + name, e);
      } finally {
        if (os != null) {
Ejemplo n.º 29
 * Pulls together {@link IPortalDataType}, {@link IDataUpgrader}, and {@link IDataImporter}
 * implementations to handle data upgrade, import, export and removal operations.
 * @author Eric Dalquist
public class JaxbPortalDataHandlerService implements IPortalDataHandlerService {

  /** Tracks the base import directory to allow for easier to read logging when importing */
  private static final ThreadLocal<String> IMPORT_BASE_DIR = new ThreadLocal<String>();

  private static final String REPORT_FORMAT = "%s,%s,%.2fms\n";

  // Archive and compression media types. importDataArchive compares the detected type of an
  // import archive against these to pick the matching archive or compressor input stream.
  private static final MediaType MT_JAVA_ARCHIVE = MediaType.application("java-archive");
  private static final MediaType MT_CPIO = MediaType.application("x-cpio");
  private static final MediaType MT_AR = MediaType.application("x-archive");
  private static final MediaType MT_TAR = MediaType.application("x-tar");
  private static final MediaType MT_BZIP2 = MediaType.application("x-bzip2");
  private static final MediaType MT_GZIP = MediaType.application("x-gzip");
  private static final MediaType MT_PACK200 = MediaType.application("x-java-pack200");
  private static final MediaType MT_XZ = MediaType.application("x-xz");

  protected final Logger logger = LoggerFactory.getLogger(getClass());

  // Order in which data must be imported
  private List<PortalDataKey> dataKeyImportOrder = Collections.emptyList();
  // Map to lookup the associated IPortalDataType for each known PortalDataKey
  private Map<PortalDataKey, IPortalDataType> dataKeyTypes = Collections.emptyMap();

  // Ant path matcher patterns that a file must match when scanning directories (unless a pattern is
  // explicitly specified)
  private Set<String> dataFileIncludes = Collections.emptySet();
  private Set<String> dataFileExcludes = ImmutableSet.copyOf(DirectoryScanner.getDefaultExcludes());

  // Data upgraders mapped by PortalDataKey
  private Map<PortalDataKey, IDataUpgrader> portalDataUpgraders = Collections.emptyMap();
  // Data importers mapped by PortalDataKey
  private Map<PortalDataKey, IDataImporter<Object>> portalDataImporters = Collections.emptyMap();

  // ExportAll data types
  private Set<IPortalDataType> exportAllPortalDataTypes = null;
  // All portal data types available for export
  private Set<IPortalDataType> exportPortalDataTypes = Collections.emptySet();
  // Data exporters mapped by IPortalDateType#getTypeId()
  private Map<String, IDataExporter<Object>> portalDataExporters = Collections.emptyMap();

  // All portal data types available for delete
  private Set<IPortalDataType> deletePortalDataTypes = Collections.emptySet();
  // Data deleters mapped by IPortalDateType#getTypeId()
  private Map<String, IDataDeleter<Object>> portalDataDeleters = Collections.emptyMap();

  private org.apereo.portal.utils.DirectoryScanner directoryScanner;
  private ExecutorService importExportThreadPool;
  private XmlUtilities xmlUtilities;

  private long maxWait = -1;
  private TimeUnit maxWaitTimeUnit = TimeUnit.MILLISECONDS;

  public void setXmlUtilities(XmlUtilities xmlUtilities) {
    // Stores the XmlUtilities collaborator on this service.
    this.xmlUtilities = xmlUtilities;

  public void setImportExportThreadPool(
      @Qualifier("importExportThreadPool") ExecutorService importExportThreadPool) {
    // Stores the pool used to run import tasks and rebuilds the directory scanner on top of the
    // same pool, so both always share one executor.
    this.importExportThreadPool = importExportThreadPool;
    this.directoryScanner = new ConcurrentDirectoryScanner(this.importExportThreadPool);

  /**
   * Maximum time to wait for an import, export, or delete to execute. The value is interpreted in
   * the unit set via {@link #setMaxWaitTimeUnit(TimeUnit)}; the field's initial value is -1.
   *
   * @param maxWait the maximum wait duration
   */
  public void setMaxWait(long maxWait) {
    this.maxWait = maxWait;

  /**
   * {@link TimeUnit} for {@link #setMaxWait(long)} value. The field's initial value is {@link
   * TimeUnit#MILLISECONDS}.
   *
   * @param maxWaitTimeUnit unit in which the maximum wait is expressed
   */
  public void setMaxWaitTimeUnit(TimeUnit maxWaitTimeUnit) {
    this.maxWaitTimeUnit = maxWaitTimeUnit;

  /** Order in which data types should be imported. */
  @javax.annotation.Resource(name = "dataTypeImportOrder")
  public void setDataTypeImportOrder(List<IPortalDataType> dataTypeImportOrder) {
    final ArrayList<PortalDataKey> dataKeyImportOrder =
        new ArrayList<PortalDataKey>(dataTypeImportOrder.size() * 2);
    final Map<PortalDataKey, IPortalDataType> dataKeyTypes =
        new LinkedHashMap<PortalDataKey, IPortalDataType>(dataTypeImportOrder.size() * 2);

    for (final IPortalDataType portalDataType : dataTypeImportOrder) {
      final List<PortalDataKey> supportedDataKeys = portalDataType.getDataKeyImportOrder();
      for (final PortalDataKey portalDataKey : supportedDataKeys) {
        dataKeyTypes.put(portalDataKey, portalDataType);

    this.dataKeyImportOrder = Collections.unmodifiableList(dataKeyImportOrder);
    this.dataKeyTypes = Collections.unmodifiableMap(dataKeyTypes);

  /**
   * Ant path matching patterns that files must match to be included. These are the patterns used
   * by {@code importDataDirectory} when the caller passes no explicit pattern.
   *
   * @param dataFileIncludes include patterns; the set is stored as given, without copying
   */
  @javax.annotation.Resource(name = "dataFileIncludes")
  public void setDataFileIncludes(Set<String> dataFileIncludes) {
    this.dataFileIncludes = dataFileIncludes;

   * Ant path matching patterns that exclude matched files. Defaults to {@link
   * DirectoryScanner#addDefaultExcludes()}
  public void setDataFileExcludes(Set<String> dataFileExcludes) {
    // Replaces the exclude patterns; the set is stored as given, without copying.
    this.dataFileExcludes = dataFileExcludes;

  /** {@link IDataImporter} implementations to delegate import operations to. */
  @Autowired(required = false)
  public void setDataImporters(Collection<IDataImporter<? extends Object>> dataImporters) {
    final Map<PortalDataKey, IDataImporter<Object>> dataImportersMap =
        new LinkedHashMap<PortalDataKey, IDataImporter<Object>>();

    for (final IDataImporter<?> dataImporter : dataImporters) {

      try {

        final Set<PortalDataKey> importDataKeys = dataImporter.getImportDataKeys();

        for (final PortalDataKey importDataKey : importDataKeys) {
              "Registering IDataImporter for '{}' - {}",
              new Object[] {importDataKey, dataImporter});
          final IDataImporter<Object> existing =
              dataImportersMap.put(importDataKey, (IDataImporter<Object>) dataImporter);
          if (existing != null) {
                "Duplicate IDataImporter PortalDataKey for {} Replacing {} with {}",
                new Object[] {importDataKey, existing, dataImporter});

      } catch (Exception exception) {
        logger.error("Failed to register data importer {}.", dataImporter, exception);

    this.portalDataImporters = Collections.unmodifiableMap(dataImportersMap);

  /** {@link IDataExporter} implementations to delegate export operations to. */
  @Autowired(required = false)
  public void setDataExporters(Collection<IDataExporter<? extends Object>> dataExporters) {
    final Map<String, IDataExporter<Object>> dataExportersMap =
        new LinkedHashMap<String, IDataExporter<Object>>();

    final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>();

    for (final IDataExporter<?> dataExporter : dataExporters) {

      try {

        final IPortalDataType portalDataType = dataExporter.getPortalDataType();
        final String typeId = portalDataType.getTypeId();

            "Registering IDataExporter for '{}' - {}", new Object[] {typeId, dataExporter});
        final IDataExporter<Object> existing =
            dataExportersMap.put(typeId, (IDataExporter<Object>) dataExporter);
        if (existing != null) {
              "Duplicate IDataExporter typeId for {} Replacing {} with {}",
              new Object[] {typeId, existing, dataExporter});


      } catch (Exception exception) {
        logger.error("Failed to register data exporter {}.", dataExporter, exception);

    this.portalDataExporters = Collections.unmodifiableMap(dataExportersMap);
    this.exportPortalDataTypes = Collections.unmodifiableSet(portalDataTypes);

   * Optional set of all portal data types to export. If not specified all available portal data
   * types will be listed.
  @javax.annotation.Resource(name = "exportAllPortalDataTypes")
  public void setExportAllPortalDataTypes(Set<IPortalDataType> exportAllPortalDataTypes) {
    // Stores an immutable copy, so later changes to the caller's set are not reflected here.
    this.exportAllPortalDataTypes = ImmutableSet.copyOf(exportAllPortalDataTypes);

  /** {@link IDataDeleter} implementations to delegate delete operations to. */
  @Autowired(required = false)
  public void setDataDeleters(Collection<IDataDeleter<? extends Object>> dataDeleters) {
    final Map<String, IDataDeleter<Object>> dataDeletersMap =
        new LinkedHashMap<String, IDataDeleter<Object>>();

    final Set<IPortalDataType> portalDataTypes = new LinkedHashSet<IPortalDataType>();

    for (final IDataDeleter<?> dataDeleter : dataDeleters) {

      try {

        final IPortalDataType portalDataType = dataDeleter.getPortalDataType();
        final String typeId = portalDataType.getTypeId();

            "Registering IDataDeleter for '{}' - {}", new Object[] {typeId, dataDeleter});
        final IDataDeleter<Object> existing =
            dataDeletersMap.put(typeId, (IDataDeleter<Object>) dataDeleter);
        if (existing != null) {
              "Duplicate IDataDeleter typeId for {} Replacing {} with {}",
              new Object[] {typeId, existing, dataDeleter});


      } catch (Exception exception) {
        logger.error("Failed to register data deleter {}.", dataDeleter, exception);

    this.portalDataDeleters = Collections.unmodifiableMap(dataDeletersMap);
    this.deletePortalDataTypes = Collections.unmodifiableSet(portalDataTypes);

  /** {@link IDataUpgrader} implementations to delegate upgrade operations to. */
  @Autowired(required = false)
  public void setDataUpgraders(Collection<IDataUpgrader> dataUpgraders) {
    final Map<PortalDataKey, IDataUpgrader> dataUpgraderMap =
        new LinkedHashMap<PortalDataKey, IDataUpgrader>();

    for (final IDataUpgrader dataUpgrader : dataUpgraders) {

      try {

        final Set<PortalDataKey> upgradeDataKeys = dataUpgrader.getSourceDataTypes();
        for (final PortalDataKey upgradeDataKey : upgradeDataKeys) {
              "Registering IDataUpgrader for '{}' - {}", upgradeDataKey, dataUpgrader);
          final IDataUpgrader existing = dataUpgraderMap.put(upgradeDataKey, dataUpgrader);
          if (existing != null) {
                "Duplicate IDataUpgrader PortalDataKey for {} Replacing {} with {}",
                new Object[] {upgradeDataKey, existing, dataUpgrader});

      } catch (Exception exception) {
        logger.error("Failed to register data upgrader {}.", dataUpgrader, exception);

    this.portalDataUpgraders = Collections.unmodifiableMap(dataUpgraderMap);

  public void importDataArchive(Resource archive, BatchImportOptions options) {
    // Opens the resource's stream and delegates to the stream-based overload. A failure to open
    // the stream is rethrown unchecked with the resource named in the message.
    try {
      importDataArchive(archive, archive.getInputStream(), options);
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);

  protected void importDataArchive(
      Resource archive, InputStream resourceStream, BatchImportOptions options) {
    // Detects the media type of the stream, wraps it in the matching archive or compressor
    // stream, and re-dispatches to the importDataArchive overload for that stream type. An
    // unrecognized type is rejected with a RuntimeException.
    BufferedInputStream bufferedResourceStream = null;
    try {
      // Make sure the stream is buffered
      if (resourceStream instanceof BufferedInputStream) {
        bufferedResourceStream = (BufferedInputStream) resourceStream;
      } else {
        bufferedResourceStream = new BufferedInputStream(resourceStream);

      // Buffer up to 100MB, bad things will happen if we bust this buffer.
      // TODO see if there is a buffered stream that will write to a file once the buffer fills up
      // The mark is set before detection so the bytes read by the detector can be replayed.
      bufferedResourceStream.mark(100 * 1024 * 1024);
      final MediaType type = getMediaType(bufferedResourceStream, archive.getFilename());

      // Archive formats: handed to the ArchiveInputStream overload.
      if (MT_JAVA_ARCHIVE.equals(type)) {
        final ArchiveInputStream archiveStream = new JarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MediaType.APPLICATION_ZIP.equals(type)) {
        final ArchiveInputStream archiveStream = new ZipArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_CPIO.equals(type)) {
        final ArchiveInputStream archiveStream = new CpioArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_AR.equals(type)) {
        final ArchiveInputStream archiveStream = new ArArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      } else if (MT_TAR.equals(type)) {
        final ArchiveInputStream archiveStream = new TarArchiveInputStream(bufferedResourceStream);
        importDataArchive(archive, archiveStream, options);
      // Compression formats: the decompressing stream is passed on as a CompressorInputStream;
      // presumably this resolves back to this method so the inner content is detected in turn.
      // TODO confirm overload resolution.
      } else if (MT_BZIP2.equals(type)) {
        final CompressorInputStream compressedStream =
            new BZip2CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_GZIP.equals(type)) {
        final CompressorInputStream compressedStream =
            new GzipCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_PACK200.equals(type)) {
        final CompressorInputStream compressedStream =
            new Pack200CompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else if (MT_XZ.equals(type)) {
        final CompressorInputStream compressedStream =
            new XZCompressorInputStream(bufferedResourceStream);
        importDataArchive(archive, compressedStream, options);
      } else {
        // Also reached when getMediaType returns null after a detection failure.
        throw new RuntimeException("Unrecognized archive media type: " + type);
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + archive, e);
    // NOTE(review): the body of this finally block is not visible; presumably it closes
    // bufferedResourceStream.
    } finally {

  /**
   * Extracts the archive resource and then runs the batch-import process on it. Entries are
   * unpacked into a fresh temporary directory, which is then passed to {@code importDataDirectory}
   * with no explicit pattern.
   *
   * @param resource the archive being imported; used only in the error message
   * @param resourceStream archive stream positioned before the first entry
   * @param options batch import options forwarded to the directory import
   */
  protected void importDataArchive(
      final Resource resource,
      final ArchiveInputStream resourceStream,
      BatchImportOptions options) {

    final File tempDir = Files.createTempDir();
    try {
      ArchiveEntry archiveEntry;
      while ((archiveEntry = resourceStream.getNextEntry()) != null) {
        // NOTE(review): the entry name comes from the archive and is joined to tempDir without
        // validation, so a name containing "../" could resolve outside tempDir. Confirm and guard.
        final File entryFile = new File(tempDir, archiveEntry.getName());
        if (archiveEntry.isDirectory()) {
        } else {
          // NOTE(review): the statements that create directories and copy the entry's bytes are
          // not visible in this listing; only the input supplier below remains.

              new InputSupplier<InputStream>() {
                public InputStream getInput() throws IOException {
                  // Shielded so the shared archive stream is not closed after a single entry.
                  return new CloseShieldInputStream(resourceStream);

      importDataDirectory(tempDir, null, options);
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to extract data from '" + resource + "' to '" + tempDir + "' for batch import.",
    // NOTE(review): the body of this finally block is not visible; presumably it removes tempDir.
    } finally {

  protected MediaType getMediaType(BufferedInputStream inputStream, String fileName)
      throws IOException {
    // Detects the media type of the stream with a Tika DefaultDetector, using the file name as an
    // additional hint. Returns null when detection fails with an IOException.
    // The caller's stream is wrapped in a CloseShieldInputStream, presumably so that closing the
    // TikaInputStream does not close it.
    final TikaInputStream tikaInputStreamStream =
        TikaInputStream.get(new CloseShieldInputStream(inputStream));
    try {
      final Detector detector = new DefaultDetector();
      final Metadata metadata = new Metadata();
      metadata.set(Metadata.RESOURCE_NAME_KEY, fileName);

      final MediaType type = detector.detect(tikaInputStreamStream, metadata);
      logger.debug("Determined '{}' for '{}'", type, fileName);
      return type;
    } catch (IOException e) {
      // NOTE(review): the message says "assuming XML" but null is returned, and the visible caller
      // treats an unmatched type as an unrecognized archive. Confirm the intended fallback.
      logger.warn("Failed to determine media type for '" + fileName + "' assuming XML", e);
      return null;
    } finally {

      // Reset the buffered stream to make up for anything read by the detector

  public void importDataDirectory(
      File directory, String pattern, final BatchImportOptions options) {
    // Scans the directory for data files, groups them by PortalDataKey, and imports each group on
    // the import/export thread pool in the configured key order, writing a report file as it goes.
    if (!directory.exists()) {
      throw new IllegalArgumentException(
          "The specified directory '" + directory + "' does not exist");

    // Create the file filter to use when searching for files to import
    // An explicit pattern replaces the configured include patterns; excludes always apply.
    final FileFilter fileFilter;
    if (pattern != null) {
      fileFilter = new AntPatternFileFilter(true, false, pattern, this.dataFileExcludes);
    } else {
      fileFilter =
          new AntPatternFileFilter(true, false, this.dataFileIncludes, this.dataFileExcludes);

    // Determine the parent directory to log to
    final File logDirectory = determineLogDirectory(options, "import");

    // Setup reporting file
    final File importReport = new File(logDirectory, "data-import.txt");
    final PrintWriter reportWriter;
    try {
      reportWriter =
          new PrintWriter(new PeriodicFlushingBufferedWriter(500, new FileWriter(importReport)));
    } catch (IOException e) {
      throw new RuntimeException("Failed to create FileWriter for: " + importReport, e);

    // Convert directory to URI String to provide better logging output
    final URI directoryUri = directory.toURI();
    final String directoryUriStr = directoryUri.toString();
    try {
      // Scan the specified directory for files to import
      logger.info("Scanning for files to Import from: {}", directory);
      final PortalDataKeyFileProcessor fileProcessor =
          new PortalDataKeyFileProcessor(this.dataKeyTypes, options);
      this.directoryScanner.scanDirectoryNoResults(directory, fileFilter, fileProcessor);
      final long resourceCount = fileProcessor.getResourceCount();
      logger.info("Found {} files to Import from: {}", resourceCount, directory);

      // See if the import should fail on error
      // With no options supplied, failing on error is the default.
      final boolean failOnError = options != null ? options.isFailOnError() : true;

      // Map of files to import, grouped by type
      // NOTE(review): the initializer of dataToImport is not visible in this listing.
      final ConcurrentMap<PortalDataKey, Queue<Resource>> dataToImport =

      // Import the data files
      // Groups are removed from the map as they are processed, so whatever is left afterwards has
      // no position in the configured import order.
      for (final PortalDataKey portalDataKey : this.dataKeyImportOrder) {
        final Queue<Resource> files = dataToImport.remove(portalDataKey);
        if (files == null) {

        final Queue<ImportFuture<?>> importFutures = new LinkedList<ImportFuture<?>>();
        final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

        final int fileCount = files.size();
        logger.info("Importing {} files of type {}", fileCount, portalDataKey);
        reportWriter.println(portalDataKey + "," + fileCount);

        while (!files.isEmpty()) {
          final Resource file = files.poll();

          // Check for completed futures on every iteration, needed to fail as fast as possible on
          // an import exception
          // NOTE(review): nothing visible here adds newFailed to failedFutures.
          final List<FutureHolder<?>> newFailed =
              waitForFutures(importFutures, reportWriter, logDirectory, false);

          final AtomicLong importTime = new AtomicLong(-1);

          // Create import task
          final Callable<Object> task =
              new CallableWithoutResult() {
                protected void callWithoutResult() {
                  try {
                    importData(file, portalDataKey);
                  } finally {
                    // Replaces the stored value with "now minus stored value".
                    // NOTE(review): no start timestamp is stored in the visible lines, so this
                    // would compute nanoTime() - (-1); confirm the start is recorded.
                    importTime.set(System.nanoTime() - importTime.get());

          // Submit the import task
          final Future<?> importFuture = this.importExportThreadPool.submit(task);

          // Add the future for tracking
          importFutures.offer(new ImportFuture(importFuture, file, portalDataKey, importTime));

        // Wait for all of the imports of this type to complete
        final List<FutureHolder<?>> newFailed =
            waitForFutures(importFutures, reportWriter, logDirectory, true);

        // NOTE(review): failedFutures is never populated in the visible lines, so this condition
        // cannot be true as shown; confirm the failed futures are collected.
        if (failOnError && !failedFutures.isEmpty()) {
          throw new RuntimeException(
                  + " "
                  + portalDataKey
                  + " entities failed to import.\n\n"
                  + "\tPer entity exception logs and a full report can be found in "
                  + logDirectory
                  + "\n");


      // Any group still present belongs to a key that is missing from the import order.
      if (!dataToImport.isEmpty()) {
        throw new IllegalStateException(
            "The following PortalDataKeys are not listed in the dataTypeImportOrder List: "
                + dataToImport.keySet());

      logger.info("For a detailed report on the data import see " + importReport);
    } catch (InterruptedException e) {
      // NOTE(review): the thread's interrupt status is not restored before rethrowing; consider
      // Thread.currentThread().interrupt().
      throw new RuntimeException("Interrupted while waiting for entities to import", e);
    // NOTE(review): the body of this finally block is not visible; presumably it closes
    // reportWriter.
    } finally {

  /**
   * Determine the directory that import/export reports are logged to.
   *
   * <p>The result is {@code data-<operation>-reports} below the parent directory supplied by
   * {@code options}. When {@code options} is null or supplies no parent, the directory returned by
   * {@code Files.createTempDir()} is used as the parent instead.
   *
   * @param options batch options, may be null
   * @param operation operation name embedded in the directory name, e.g. "export"
   * @return the canonical form of the report directory
   * @throws RuntimeException wrapping the IOException raised inside the try block
   */
  private File determineLogDirectory(final BatchOptions options, String operation) {
    File logDirectoryParent = options != null ? options.getLogDirectoryParent() : null;
    if (logDirectoryParent == null) {
      // No parent configured, fall back to a fresh temporary directory
      logDirectoryParent = Files.createTempDir();
    File logDirectory = new File(logDirectoryParent, "data-" + operation + "-reports");
    try {
      // Resolve to the canonical path so the directory reported to the user is unambiguous
      logDirectory = logDirectory.getCanonicalFile();
      // NOTE(review): the message below talks about cleaning the directory, but no clean/delete
      // step is visible in this try block - confirm whether one is expected here
    } catch (IOException e) {
      throw new RuntimeException(
          "Failed to clean data-" + operation + " log directory: " + logDirectory, e);
    return logDirectory;

  public void importData(final Resource resource) {
    this.importData(resource, null);

  public void importData(Source source) {
    this.importData(source, null);

  /**
   * Import portal data from an XML {@link Source}.
   *
   * <p>The source is wrapped in a buffered StAX event reader. When no key is supplied it is built
   * from the document's root element. The {@code IPortalDataType} configured for the key then
   * post-processes it into a set of keys; when that set holds several keys they are handled in
   * {@code dataKeyImportOrder} order.
   *
   * @param source XML to import
   * @param portalDataKey key identifying the data; when null it is derived from the root element
   * @throws RuntimeException if no IPortalDataType is configured for the key
   */
  public final void importData(final Source source, PortalDataKey portalDataKey) {
    // Get a StAX reader for the source to determine info about the data to import
    final BufferedXMLEventReader bufferedXmlEventReader = createSourceXmlEventReader(source);

    // If no PortalDataKey was passed build it from the source
    if (portalDataKey == null) {
      final StartElement rootElement = StaxUtils.getRootElement(bufferedXmlEventReader);
      portalDataKey = new PortalDataKey(rootElement);

    // May be null: not every Source carries a system id
    final String systemId = source.getSystemId();

    // Post Process the PortalDataKey to see if more complex import operations are needed
    final IPortalDataType portalDataType = this.dataKeyTypes.get(portalDataKey);
    if (portalDataType == null) {
      throw new RuntimeException(
          "No IPortalDataType configured for "
              + portalDataKey
              + ", the resource will be ignored: "
              + getPartialSystemId(systemId));
    final Set<PortalDataKey> postProcessedPortalDataKeys =
        portalDataType.postProcessPortalDataKey(systemId, portalDataKey, bufferedXmlEventReader);

    // If only a single result from post processing import
    // NOTE(review): the body of this branch is not visible in this excerpt
    if (postProcessedPortalDataKeys.size() == 1) {
    // If multiple results from post processing ordering is needed
    else {
      // Iterate over the data key order list to run the imports in the correct order
      for (final PortalDataKey orderedPortalDataKey : this.dataKeyImportOrder) {
        if (postProcessedPortalDataKeys.contains(orderedPortalDataKey)) {
          // Reset to the start of the XML document for each import/upgrade call
          // NOTE(review): no reset call on the reader is visible before this call - confirm
          this.importOrUpgradeData(systemId, orderedPortalDataKey, bufferedXmlEventReader);

  /**
   * @param portalDataKey Optional PortalDataKey to use, useful for batch imports where
   *     post-processing of keys has already taken place
   */
  protected final void importData(final Resource resource, final PortalDataKey portalDataKey) {
    // Opens the resource's stream and hands it, together with the resource URI as system id,
    // to the Source based import. A failure to open the stream is rethrown unchecked.
    final InputStream resourceStream;
    try {
      resourceStream = resource.getInputStream();
    } catch (IOException e) {
      throw new RuntimeException("Could not load InputStream for resource: " + resource, e);

    try {
      // The URI becomes the StreamSource system id, which is what import log messages report
      final String resourceUri = ResourceUtils.getResourceUri(resource);
      this.importData(new StreamSource(resourceStream, resourceUri), portalDataKey);
    } finally {
      // NOTE(review): the finally body is not visible in this excerpt; presumably it closes
      // resourceStream - confirm

  protected String getPartialSystemId(String systemId) {
    final String directoryUriStr = IMPORT_BASE_DIR.get();
    if (directoryUriStr == null) {
      return systemId;

    if (systemId.startsWith(directoryUriStr)) {
      return systemId.substring(directoryUriStr.length());

    return systemId;

  /**
   * Run the import/update process on the data.
   *
   * <p>An importer registered for the key is tried first, then an upgrader. The upgrader
   * transforms the XML as a DOM; when it reports that an import is still required, the upgraded
   * DOM is fed back into {@code importData} under the key read from the upgraded document.
   *
   * @param systemId system id of the data, used in log and error messages
   * @param portalDataKey key used to look up the importer or upgrader
   * @param xmlEventReader reader supplying the XML to import or upgrade
   * @throws IllegalArgumentException if neither an importer nor an upgrader is registered
   * @throws RuntimeException if the XML cannot be converted to a DOM for the upgrader
   */
  protected final void importOrUpgradeData(
      String systemId, PortalDataKey portalDataKey, XMLEventReader xmlEventReader) {
    // See if there is a registered importer for the data, if so import
    final IDataImporter<Object> dataImporterExporter = this.portalDataImporters.get(portalDataKey);
    if (dataImporterExporter != null) {
      this.logger.debug("Importing: {}", getPartialSystemId(systemId));
      final Object data = unmarshallData(xmlEventReader, dataImporterExporter);
      // NOTE(review): the call handing `data` to the importer and the early return are not
      // visible in this excerpt - confirm
      this.logger.info("Imported : {}", getPartialSystemId(systemId));

    // No importer, see if there is an upgrader, if so upgrade
    final IDataUpgrader dataUpgrader = this.portalDataUpgraders.get(portalDataKey);
    if (dataUpgrader != null) {
      this.logger.debug("Upgrading: {}", getPartialSystemId(systemId));

      // Convert the StAX stream to a DOM node, due to poor JDK support for StAX with XSLT
      final Node sourceNode;
      try {
        sourceNode = xmlUtilities.convertToDom(xmlEventReader);
      } catch (XMLStreamException e) {
        throw new RuntimeException("Failed to create StAXSource from original XML reader", e);
      final DOMSource source = new DOMSource(sourceNode);

      // The upgrader writes the upgraded document into `result`
      final DOMResult result = new DOMResult();
      final boolean doImport = dataUpgrader.upgradeData(source, result);
      if (doImport) {
        // If the upgrader didn't handle the import as well wrap the result DOM in a new Source and
        // start the import process over again
        final org.w3c.dom.Node node = result.getNode();
        final PortalDataKey upgradedPortalDataKey = new PortalDataKey(node);
        // Serializing both documents is only worth the cost at trace level
        if (this.logger.isTraceEnabled()) {
              "Upgraded: "
                  + getPartialSystemId(systemId)
                  + " to "
                  + upgradedPortalDataKey
                  + "\n\nSource XML: \n"
                  + XmlUtilitiesImpl.toString(source.getNode())
                  + "\n\nResult XML: \n"
                  + XmlUtilitiesImpl.toString(node));
        } else {
              "Upgraded: {} to {}", getPartialSystemId(systemId), upgradedPortalDataKey);
        // Keep the original system id so later messages still point at the source file
        final DOMSource upgradedSource = new DOMSource(node, systemId);
        this.importData(upgradedSource, upgradedPortalDataKey);
      } else {
        this.logger.info("Upgraded and Imported: {}", getPartialSystemId(systemId));

    // No importer or upgrader found, fail
    throw new IllegalArgumentException(
        "Provided data "
            + portalDataKey
            + " has no registered importer or upgrader support: "
            + systemId);

  protected Object unmarshallData(
      final XMLEventReader bufferedXmlEventReader,
      final IDataImporter<Object> dataImporterExporter) {
    final Unmarshaller unmarshaller = dataImporterExporter.getUnmarshaller();

    try {
      final StAXSource source = new StAXSource(bufferedXmlEventReader);
      return unmarshaller.unmarshal(source);
    } catch (XmlMappingException e) {
      throw new RuntimeException("Failed to map provided XML to portal data", e);
    } catch (IOException e) {
      throw new RuntimeException("Failed to read the provided XML data", e);
    } catch (XMLStreamException e) {
      throw new RuntimeException("Failed to create StAX Source to read XML data", e);

  /**
   * Obtain a {@code BufferedXMLEventReader} for the given source.
   *
   * <p>A StAXSource that already carries an event reader is reused: returned directly when it is
   * a BufferedXMLEventReader, wrapped otherwise. Any other source gets a new event reader from
   * the XMLInputFactory supplied by {@code xmlUtilities}.
   *
   * @param source the XML source to read
   * @return a buffered event reader over the source
   * @throws RuntimeException if the input factory cannot create an event reader for the source
   */
  protected BufferedXMLEventReader createSourceXmlEventReader(final Source source) {
    // If it is a StAXSource see if we can do better handling of it
    if (source instanceof StAXSource) {
      final StAXSource staxSource = (StAXSource) source;
      XMLEventReader xmlEventReader = staxSource.getXMLEventReader();
      // A StAXSource may wrap a stream reader instead, in which case this is null
      if (xmlEventReader != null) {
        if (xmlEventReader instanceof BufferedXMLEventReader) {
          final BufferedXMLEventReader bufferedXMLEventReader =
              (BufferedXMLEventReader) xmlEventReader;
          // NOTE(review): the reader is returned as-is in the visible code; confirm whether it
          // should be rewound to the document start before reuse
          return bufferedXMLEventReader;

        // Wrap the existing reader; NOTE(review): -1 presumably means an unbounded buffer - confirm
        return new BufferedXMLEventReader(xmlEventReader, -1);

    // Anything else: let the input factory build the event reader from the source
    final XMLInputFactory xmlInputFactory = this.xmlUtilities.getXmlInputFactory();
    final XMLEventReader xmlEventReader;
    try {
      xmlEventReader = xmlInputFactory.createXMLEventReader(source);
    } catch (XMLStreamException e) {
      throw new RuntimeException("Failed to create XML Event Reader for data Source", e);
    return new BufferedXMLEventReader(xmlEventReader, -1);

  public Iterable<IPortalDataType> getExportPortalDataTypes() {
    return this.exportPortalDataTypes;

  public Iterable<IPortalDataType> getDeletePortalDataTypes() {
    return this.deletePortalDataTypes;

  public Iterable<? extends IPortalData> getPortalData(String typeId) {
    final IDataExporter<Object> dataImporterExporter = getPortalDataExporter(typeId);
    return dataImporterExporter.getPortalData();

  public String exportData(String typeId, String dataId, Result result) {
    final IDataExporter<Object> portalDataExporter = this.getPortalDataExporter(typeId);
    final Object data = portalDataExporter.exportData(dataId);
    if (data == null) {
      return null;

    final Marshaller marshaller = portalDataExporter.getMarshaller();
    try {
      marshaller.marshal(data, result);
      return portalDataExporter.getFileName(data);
    } catch (XmlMappingException e) {
      throw new RuntimeException("Failed to map provided portal data to XML", e);
    } catch (IOException e) {
      throw new RuntimeException("Failed to write the provided XML data", e);

  /**
   * Export a single data item to {@code <fileName>.<typeId>.xml} inside the given directory.
   *
   * <p>The data is first written to a temporary file and then moved to its destination, so a
   * failed export does not leave a partial file under the final name.
   *
   * @param typeId id of the data type
   * @param dataId id of the data item to export
   * @param directory directory the exported file is moved into
   * @return true if a file was exported, false if the export produced no file name and was skipped
   * @throws RuntimeException if the temp file cannot be created or the export fails
   */
  public boolean exportData(String typeId, String dataId, File directory) {

    final File exportTempFile;
    try {
      // NOTE(review): the call these arguments belong to is not visible in this excerpt. Padding
      // dataId to 2 chars plus "-" presumably satisfies the 3 character minimum prefix length of
      // File.createTempFile - confirm
      exportTempFile =
              SafeFilenameUtils.makeSafeFilename(StringUtils.rightPad(dataId, 2, '-') + "-"),
              SafeFilenameUtils.makeSafeFilename("." + typeId),
    } catch (IOException e) {
      throw new RuntimeException(
          "Could not create temp file to export " + typeId + " " + dataId, e);

    try {
      final String fileName = this.exportData(typeId, dataId, new StreamResult(exportTempFile));
      // A null file name means there was nothing to export for this id
      if (fileName == null) {
        logger.info("Skipped: type={} id={}", typeId, dataId);
        return false;

      final File destFile = new File(directory, fileName + "." + typeId + ".xml");
      if (destFile.exists()) {
            "Exporting "
                + typeId
                + " "
                + dataId
                + " but destination file already exists, it will be overwritten: "
                + destFile);
      // NOTE(review): the message above promises an overwrite, but no statement removing
      // destFile is visible before the move - confirm the move can succeed when it exists
      FileUtils.moveFile(exportTempFile, destFile);
      logger.info("Exported: {}", destFile);

      return true;
    } catch (Exception e) {
      // Runtime exceptions pass through untouched, only checked ones are wrapped
      if (e instanceof RuntimeException) {
        throw (RuntimeException) e;

      throw new RuntimeException("Failed to export " + typeId + " " + dataId, e);
    } finally {
      // NOTE(review): the finally body is not visible in this excerpt; presumably it removes the
      // temp file - confirm

  /**
   * Export every data item of each given type into {@code directory/<typeId>}.
   *
   * <p>One export task per data item is submitted to {@code importExportThreadPool}. A report is
   * written to {@code data-export.txt} in the log directory chosen by
   * {@code determineLogDirectory}.
   *
   * @param typeIds ids of the data types to export
   * @param directory base directory; each type is exported into a sub directory named after it
   * @param options batch options, may be null in which case failing on error is enabled
   * @throws RuntimeException if the report file cannot be created, if failing on error is enabled
   *     and exports of a type failed, or if the thread is interrupted while waiting
   */
  public void exportAllDataOfType(Set<String> typeIds, File directory, BatchExportOptions options) {
    final Queue<ExportFuture<?>> exportFutures = new ConcurrentLinkedQueue<ExportFuture<?>>();
    final boolean failOnError = options != null ? options.isFailOnError() : true;

    // Determine the parent directory to log to
    final File logDirectory = determineLogDirectory(options, "export");

    // Setup reporting file
    final File exportReport = new File(logDirectory, "data-export.txt");
    final PrintWriter reportWriter;
    try {
      reportWriter = new PrintWriter(new BufferedWriter(new FileWriter(exportReport)));
    } catch (IOException e) {
      throw new RuntimeException("Failed to create FileWriter for: " + exportReport, e);

    try {
      for (final String typeId : typeIds) {
        // Failures are tracked per type so the error below can name the type
        final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

        final File typeDir = new File(directory, typeId);
        logger.info("Adding all data of type {} to export queue: {}", typeId, typeDir);

        reportWriter.println(typeId + "," + typeDir);

        final Iterable<? extends IPortalData> dataForType = this.getPortalData(typeId);
        for (final IPortalData data : dataForType) {
          final String dataId = data.getDataId();

          // Check for completed futures on every iteration, needed to fail as fast as possible on
          // an export exception
          // NOTE(review): newFailed is not visibly merged into failedFutures in this excerpt
          final List<FutureHolder<?>> newFailed =
              waitForFutures(exportFutures, reportWriter, logDirectory, false);

          // Shared with the task, which stores its timing information here
          final AtomicLong exportTime = new AtomicLong(-1);

          // Create export task
          Callable<Object> task =
              new CallableWithoutResult() {
                protected void callWithoutResult() {
                  try {
                    exportData(typeId, dataId, typeDir);
                  } finally {
                    // Replaces the stored value with the difference to the current time
                    exportTime.set(System.nanoTime() - exportTime.get());

          // Submit the export task
          final Future<?> exportFuture = this.importExportThreadPool.submit(task);

          // Add the future for tracking
          // NOTE(review): the statement adding futureHolder to exportFutures is not visible in
          // this excerpt - confirm
          final ExportFuture futureHolder =
              new ExportFuture(exportFuture, typeId, dataId, exportTime);

        // Wait for all of the exports of this type to complete
        final List<FutureHolder<?>> newFailed =
            waitForFutures(exportFutures, reportWriter, logDirectory, true);


        if (failOnError && !failedFutures.isEmpty()) {
          throw new RuntimeException(
                  + " "
                  + typeId
                  + " entities failed to export.\n"
                  + "\tPer entity exception logs and a full report can be found in "
                  + logDirectory);
    } catch (InterruptedException e) {
      // NOTE(review): the interrupt status is not restored in the visible code before rethrowing
      throw new RuntimeException("Interrupted while waiting for entities to export", e);
    } finally {
      // NOTE(review): the finally body is not visible in this excerpt; presumably it closes
      // reportWriter - confirm

  /**
   * Export all data of every exportable type into the given directory.
   *
   * <p>The types come from {@code exportAllPortalDataTypes} when that is set, otherwise from
   * {@code exportPortalDataTypes}. The actual export is delegated to {@code exportAllDataOfType}.
   *
   * @param directory base directory to export into
   * @param options batch options passed through to {@code exportAllDataOfType}
   */
  public void exportAllData(File directory, BatchExportOptions options) {
    final Set<IPortalDataType> portalDataTypes;
    if (this.exportAllPortalDataTypes != null) {
      portalDataTypes = this.exportAllPortalDataTypes;
    } else {
      portalDataTypes = this.exportPortalDataTypes;

    // LinkedHashSet keeps the iteration order of the configured types
    final Set<String> typeIds = new LinkedHashSet<String>();
    for (final IPortalDataType portalDataType : portalDataTypes) {
      // NOTE(review): the loop body is not visible in this excerpt; presumably it adds each
      // type's id to typeIds - confirm
    this.exportAllDataOfType(typeIds, directory, options);

  protected IDataExporter<Object> getPortalDataExporter(String typeId) {
    final IDataExporter<Object> dataExporter = this.portalDataExporters.get(typeId);
    if (dataExporter == null) {
      throw new IllegalArgumentException("No IDataExporter exists for: " + typeId);
    return dataExporter;

  public void deleteData(String typeId, String dataId) {
    final IDataDeleter<Object> dataDeleter = this.portalDataDeleters.get(typeId);
    if (dataDeleter == null) {
      throw new IllegalArgumentException("No IDataDeleter exists for: " + typeId);

    final Object data = dataDeleter.deleteData(dataId);
    if (data != null) {
      logger.info("Deleted data " + dataId + " of type " + typeId);
    } else {
      logger.info("No data " + dataId + " of type " + typeId + " exists to delete");

  /**
   * Used by batch import and export to wait for queued tasks to complete. Handles fail-fast
   * behavior if any of the tasks threw an exception by canceling all queued futures and logging a
   * summary of the failures. All completed futures are removed from the queue.
   *
   * @param futures Queued futures to check for completeness
   * @param wait If true it will wait for all futures to complete, if false only check for completed
   *     futures
   * @return a list of futures that either threw exceptions or timed out
   */
  protected List<FutureHolder<?>> waitForFutures(
      final Queue<? extends FutureHolder<?>> futures,
      final PrintWriter reportWriter,
      final File reportDirectory,
      final boolean wait)
      throws InterruptedException {
    // Walks the queued futures, collecting results of those that are finished (or of all of
    // them when `wait` is true) and gathering the ones that failed.

    final List<FutureHolder<?>> failedFutures = new LinkedList<FutureHolder<?>>();

    // An explicit iterator is used rather than a for-each loop
    // NOTE(review): presumably so finished entries can be removed while iterating; the removal
    // itself is not visible in this excerpt
    for (Iterator<? extends FutureHolder<?>> futuresItr = futures.iterator();
        futuresItr.hasNext(); ) {
      final FutureHolder<?> futureHolder = futuresItr.next();

      // If waiting, or if not waiting but the future is already done do the get
      // (the condition is equivalent to `wait || future.isDone()`)
      final Future<?> future = futureHolder.getFuture();
      if (wait || (!wait && future.isDone())) {

        try {
          // Don't bother doing a get() on canceled futures
          if (!future.isCancelled()) {
            // A positive maxWait bounds how long a single future is waited for
            if (this.maxWait > 0) {
              future.get(this.maxWait, this.maxWaitTimeUnit);
            } else {
              // NOTE(review): the else body is not visible in this excerpt; presumably an
              // unbounded get() - confirm

        } catch (CancellationException e) {
          // Ignore cancellation exceptions
        } catch (ExecutionException e) {
          // The task itself threw
          logger.error("Failed: " + futureHolder);


          // Write a per-entity report file named <dataType>_<dataName>.txt into reportDirectory
          try {
            final String dataReportName =
                    futureHolder.getDataType() + "_" + futureHolder.getDataName() + ".txt");
            final File dataReportFile = new File(reportDirectory, dataReportName);
            final PrintWriter dataReportWriter =
                new PrintWriter(new BufferedWriter(new FileWriter(dataReportFile)));
            try {
                  "FAIL: " + futureHolder.getDataType() + " - " + futureHolder.getDataName());
            } finally {
          } catch (Exception re) {
            // The report could not be written, the failure is reported through the log instead
                "Failed to write error report for failed "
                    + futureHolder
                    + ", logging root failure here",
        } catch (TimeoutException e) {
          // The future did not finish within maxWait
          logger.warn("Failed: " + futureHolder);


    return failedFutures;

  private abstract static class FutureHolder<T> {
    private final Future<T> future;
    private final AtomicLong time;
    private Exception error;

    public FutureHolder(Future<T> future, AtomicLong time) {
      this.future = future;
      this.time = time;

    public Future<T> getFuture() {
      return this.future;

    public double getExecutionTimeMillis() {
      final long t = time.get();
      if (!future.isDone()) {
        return System.nanoTime() - t;
      return t / 1000000.0;

    public Exception getError() {
      return error;

    public void setError(Exception error) {
      this.error = error;

    public abstract String getDescription();

    public abstract String getDataType();

    public abstract String getDataName();

  private static class ImportFuture<T> extends FutureHolder<T> {
    private final Resource resource;
    private final PortalDataKey dataKey;

    public ImportFuture(
        Future<T> future, Resource resource, PortalDataKey dataKey, AtomicLong importTime) {
      super(future, importTime);
      this.resource = resource;
      this.dataKey = dataKey;

    public String getDescription() {
      return this.resource.getDescription();

    public String getDataType() {
      return dataKey.getName().getLocalPart();

    public String getDataName() {
      return this.resource.getFilename();

    public String toString() {
      return "importing " + this.getDescription();

  private static class ExportFuture<T> extends FutureHolder<T> {
    private final String typeId;
    private final String dataId;

    public ExportFuture(Future<T> future, String typeId, String dataId, AtomicLong exportTime) {
      super(future, exportTime);
      this.typeId = typeId;
      this.dataId = dataId;

    public String getDescription() {
      return "type=" + this.typeId + ", dataId=" + this.dataId;

    public String getDataType() {
      return this.typeId;

    public String getDataName() {
      return this.dataId;

    public String toString() {
      return "exporting " + this.getDescription();
Ejemplo n.º 30
  /**
   * Parse fetched content with the Tika parser selected by the content's MIME type and convert
   * the outcome into a Nutch ParseResult.
   *
   * <p>Tika's SAX events are collected into an HTML DOM fragment, from which meta directives,
   * text, title and outlinks are extracted with the HTML helpers. Tika metadata is copied into the
   * parse metadata and the HTML parse filters are run on the result.
   *
   * @param content fetched content; its content type, base URL, URL, raw bytes and metadata are read
   * @return the filtered parse result, or an empty parse result built from a ParseStatus when the
   *     base URL is malformed, no Tika parser is found for the MIME type, or parsing throws
   */
  public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    // filled by the parser with the metadata it extracts
    Metadata tikamd = new Metadata();

    // the DOMBuilder receives the parser's SAX events and builds the DOM below `root`
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + content.getUrl(), e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());

    // defaults used when the meta directives forbid indexing or following links
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      utils.getText(sb, root); // extract text
      text = sb.toString();
      // NOTE(review): the same buffer is passed to getTitle and no reset of sb is visible in
      // between; if getTitle appends, the title would start with the page text - confirm
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      // a base URL found in the document takes precedence over the content's base URL
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      // the title is passed to ParseData separately, so it is not copied here
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      // NOTE(review): the statement these arguments belong to is not visible in this excerpt;
      // they carry the refresh target and the refresh time
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
    ParseData parseData =
        new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
        ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      // mark every parse in the result with the configured caching policy
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    return filteredParse;