예제 #1
0
  /**
   * Extracts the bare {@code type/subtype} portion of a Content-Type value.
   *
   * @param contentType the Content-Type value to parse
   * @return the {@code type/subtype} string, or an empty string when {@code MediaType.parse}
   *     yields {@code null} for the given value
   */
  public static String getMimeTypeFromContentType(String contentType) {
    MediaType parsed = MediaType.parse(contentType);
    if (parsed == null) {
      return "";
    }
    return parsed.getType() + "/" + parsed.getSubtype();
  }
예제 #2
0
  /**
   * Extracts the {@code charset} parameter of a Content-Type value.
   *
   * @param contentType the Content-Type value to parse
   * @return the value of the {@code charset} parameter, or an empty string when the value cannot
   *     be parsed or carries no such parameter
   */
  public static String getCharsetFromContentType(String contentType) {
    MediaType parsed = MediaType.parse(contentType);
    if (parsed == null) {
      return "";
    }
    String declaredCharset = parsed.getParameters().get("charset");
    return declaredCharset != null ? declaredCharset : "";
  }
예제 #3
0
  /**
   * Fills the XML, TEXT and GZIP media type collections, each with its primary media type followed
   * by every alias the media type registry reports for it. <br>
   * This is a private static method; it populates class-level collections rather than
   * per-instance state.
   */
  private static void initMediaTypes() {
    addWithAliases(XML_MEDIA_TYPES, APPLICATION_XML);
    addWithAliases(TEXT_MEDIA_TYPES, TEXT_PLAIN);
    addWithAliases(GZ_MEDIA_TYPES, MediaType.parse("application/gzip"));
  }

  /** Adds {@code type} to {@code target}, followed by all registry aliases of {@code type}. */
  private static void addWithAliases(
      java.util.Collection<? super MediaType> target, MediaType type) {
    target.add(type);
    target.addAll(MEDIA_TYPE_REGISTRY.getAliases(type));
  }
예제 #4
0
  /**
   * Selects the parser for the content type recorded in the given metadata.
   *
   * <p>The content type is normalised through the registry, then the type and each of its
   * successive supertypes is looked up in the parser map until a parser is found.
   *
   * @param metadata metadata whose {@code Metadata.CONTENT_TYPE} entry names the media type
   * @param context parse context used to obtain the available parsers
   * @return the first parser found along the supertype chain, or the fallback parser when the
   *     content type cannot be parsed or nothing along the chain matches
   */
  protected Parser getParser(Metadata metadata, ParseContext context) {
    Map<MediaType, Parser> parsersByType = getParsers(context);
    MediaType declared = MediaType.parse(metadata.get(Metadata.CONTENT_TYPE));

    // Lookups always use the normalised, canonical form of the declared type
    MediaType start = (declared == null) ? null : registry.normalize(declared);

    // Try the type itself first, then climb to each parent type in turn
    for (MediaType current = start; current != null; current = registry.getSupertype(current)) {
      Parser candidate = parsersByType.get(current);
      if (candidate != null) {
        return candidate;
      }
    }
    return fallback;
  }
  /*
   * Renders the source content to HTML with Tika, rejecting source mime types for which the
   * Tika auto-detect parser has no registered parser.
   *
   * (non-Javadoc)
   * @see org.alfresco.repo.rendition.executer.AbstractRenderingEngine#render(org.alfresco.repo.rendition.executer.AbstractRenderingEngine.RenderingContext)
   */
  @Override
  protected void render(RenderingContext context) {
    ContentReader reader = context.makeContentReader();
    String sourceMimeType = reader.getMimetype();

    // Ask Tika whether it has a parser registered for the supplied mime type
    AutoDetectParser tikaParser = new AutoDetectParser(tikaConfig);
    boolean supportedByTika =
        tikaParser.getParsers().containsKey(MediaType.parse(sourceMimeType));

    if (supportedByTika) {
      // Make the HTML Version using Tika
      // This will also extract out any images as found
      generateHTML(tikaParser, context);
      return;
    }

    throw new RenditionServiceException(
        "Source mime type of "
            + sourceMimeType
            + " is not supported by Tika for HTML conversions");
  }
예제 #6
0
  /**
   * Parses sitemap content, choosing the processing routine from the content's media type.
   *
   * <p>The declared media type is checked against the XML, TEXT and GZIP media type collections.
   * When it matches none of them, its supertype (as reported by the media type registry) is tried
   * instead, and so on up the chain until a match is found or the chain ends at octet-stream.
   * The chain is walked in place rather than by re-entering this method with the supertype's
   * string form, so a {@code null} supertype ends the search cleanly and the failure message
   * always names the content type the caller passed in.
   *
   * @param contentType the content type of the sitemap
   * @param content the raw bytes of the sitemap
   * @param url the URL the sitemap was fetched from
   * @return SiteMap/SiteMapIndex given a content type, byte content and the URL of a sitemap
   * @throws UnknownFormatException if neither the media type nor any of its supertypes is a
   *     supported XML, text or gzip type, or if the content type cannot be parsed
   * @throws IOException declared for the underlying processing routines
   */
  public AbstractSiteMap parseSiteMap(String contentType, byte[] content, URL url)
      throws UnknownFormatException, IOException {
    MediaType mediaType = MediaType.parse(contentType);

    // Octet-stream is the father of all binary types
    while (mediaType != null && !mediaType.equals(MediaType.OCTET_STREAM)) {
      if (XML_MEDIA_TYPES.contains(mediaType)) {
        return processXml(url, content);
      } else if (TEXT_MEDIA_TYPES.contains(mediaType)) {
        return (AbstractSiteMap) processText(url.toString(), content);
      } else if (GZ_MEDIA_TYPES.contains(mediaType)) {
        return processGzip(url, content);
      }
      // No match at this level: check the parent type on the next iteration
      mediaType = MEDIA_TYPE_REGISTRY.getSupertype(mediaType);
    }

    throw new UnknownFormatException(
        "Can't parse a sitemap with the MediaType of: " + contentType + " (at: " + url + ")");
  }
예제 #7
0
 /**
  * Collects the media types named by the direct child elements of {@code node} whose tag name
  * equals {@code tag}.
  *
  * @param node the parent element whose children are scanned
  * @param tag the tag name of the child elements holding media type names
  * @return the parsed media types, or an empty set when no matching child element exists
  * @throws TikaException if the text of a matching child is not a valid media type name
  * @throws IOException declared by the signature; not thrown directly by this method
  */
 private static Set<MediaType> mediaTypesListFromDomElement(Element node, String tag)
     throws TikaException, IOException {
   Set<MediaType> collected = null;
   NodeList kids = node.getChildNodes();
   for (int idx = 0; idx < kids.getLength(); idx++) {
     Node candidate = kids.item(idx);
     if (!(candidate instanceof Element)) {
       continue;
     }
     Element child = (Element) candidate;
     if (!tag.equals(child.getTagName())) {
       continue;
     }

     String mime = getText(child);
     MediaType parsed = MediaType.parse(mime);
     if (parsed == null) {
       throw new TikaException("Invalid media type name: " + mime);
     }
     // The set is created lazily, only once the first media type is found
     if (collected == null) {
       collected = new HashSet<>();
     }
     collected.add(parsed);
   }

   if (collected == null) {
     return Collections.emptySet();
   }
   return collected;
 }
  /**
   * Runs a Tika parser over the given content stream and either adds the extracted document or,
   * in extract-only mode, writes the serialized content and the collected metadata to the
   * response.
   *
   * <p>The parser is looked up by the {@code ExtractingParams.STREAM_TYPE} request parameter when
   * that parameter is present; otherwise the auto-detect parser is used.
   *
   * @param req the request supplying the parameters, schema and core
   * @param rsp the response that receives the extracted content in extract-only mode
   * @param stream the content stream to parse
   * @param processor the update request processor (not referenced directly in this method)
   * @throws SolrException with {@code BAD_REQUEST} when no parser is available; with
   *     {@code SERVER_ERROR} on a {@code SAXException}, or on a {@code TikaException} when
   *     {@code ignoreTikaException} is not set
   * @throws Exception any other failure raised while reading or parsing the stream
   */
  @Override
  public void load(
      SolrQueryRequest req,
      SolrQueryResponse rsp,
      ContentStream stream,
      UpdateRequestProcessor processor)
      throws Exception {
    Parser parser = null;
    String streamType = req.getParams().get(ExtractingParams.STREAM_TYPE, null);
    if (streamType != null) {
      // Cache?  Parsers are lightweight to construct and thread-safe, so I'm told
      MediaType mt = MediaType.parse(streamType.trim().toLowerCase(Locale.ROOT));
      // The lookup yields null when no parser is registered for the requested type
      parser = new DefaultParser(config.getMediaTypeRegistry()).getParsers().get(mt);
    } else {
      parser = autoDetectParser;
    }
    if (parser != null) {
      Metadata metadata = new Metadata();

      // If you specify the resource name (the filename, roughly) with this parameter,
      // then Tika can make use of it in guessing the appropriate MIME type:
      String resourceName = req.getParams().get(ExtractingParams.RESOURCE_NAME, null);
      if (resourceName != null) {
        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, resourceName);
      }
      // Provide stream's content type as hint for auto detection
      if (stream.getContentType() != null) {
        metadata.add(HttpHeaders.CONTENT_TYPE, stream.getContentType());
      }

      InputStream inputStream = null;
      try {
        inputStream = stream.getStream();
        // Record the stream's name, source info, size and content type in the metadata
        metadata.add(ExtractingMetadataConstants.STREAM_NAME, stream.getName());
        metadata.add(ExtractingMetadataConstants.STREAM_SOURCE_INFO, stream.getSourceInfo());
        metadata.add(ExtractingMetadataConstants.STREAM_SIZE, String.valueOf(stream.getSize()));
        metadata.add(ExtractingMetadataConstants.STREAM_CONTENT_TYPE, stream.getContentType());
        // HtmlParser and TXTParser regard Metadata.CONTENT_ENCODING in metadata
        String charset = ContentStreamBase.getCharsetFromContentType(stream.getContentType());
        if (charset != null) {
          metadata.add(HttpHeaders.CONTENT_ENCODING, charset);
        }

        String xpathExpr = params.get(ExtractingParams.XPATH_EXPRESSION);
        boolean extractOnly = params.getBool(ExtractingParams.EXTRACT_ONLY, false);
        SolrContentHandler handler =
            factory.createSolrContentHandler(metadata, params, req.getSchema());
        // Default: parse straight into the Solr handler; replaced below for extract-only/XPath
        ContentHandler parsingHandler = handler;

        StringWriter writer = null;
        BaseMarkupSerializer serializer = null;
        if (extractOnly == true) {
          // Extract-only: serialize the parsed content to a string (text or XML format)
          String extractFormat = params.get(ExtractingParams.EXTRACT_FORMAT, "xml");
          writer = new StringWriter();
          if (extractFormat.equals(TEXT_FORMAT)) {
            serializer = new TextSerializer();
            serializer.setOutputCharStream(writer);
            serializer.setOutputFormat(new OutputFormat("Text", "UTF-8", true));
          } else {
            serializer = new XMLSerializer(writer, new OutputFormat("XML", "UTF-8", true));
          }
          if (xpathExpr != null) {
            Matcher matcher = PARSER.parse(xpathExpr);
            serializer
                .startDocument(); // The MatchingContentHandler does not invoke startDocument.  See
                                  // http://tika.markmail.org/message/kknu3hw7argwiqin
            parsingHandler = new MatchingContentHandler(serializer, matcher);
          } else {
            parsingHandler = serializer;
          }
        } else if (xpathExpr != null) {
          // Indexing with an XPath restriction: only matching content reaches the Solr handler
          Matcher matcher = PARSER.parse(xpathExpr);
          parsingHandler = new MatchingContentHandler(handler, matcher);
        } // else leave it as is

        try {
          // potentially use a wrapper handler for parsing, but we still need the SolrContentHandler
          // for getting the document.
          ParseContext context = parseContextConfig.create();

          context.set(Parser.class, parser);
          context.set(HtmlMapper.class, MostlyPassthroughHtmlMapper.INSTANCE);

          // Password handling
          RegexRulesPasswordProvider epp = new RegexRulesPasswordProvider();
          String pwMapFile = params.get(ExtractingParams.PASSWORD_MAP_FILE);
          if (pwMapFile != null && pwMapFile.length() > 0) {
            InputStream is = req.getCore().getResourceLoader().openResource(pwMapFile);
            if (is != null) {
              log.debug("Password file supplied: " + pwMapFile);
              epp.parse(is);
            }
          }
          context.set(PasswordProvider.class, epp);
          String resourcePassword = params.get(ExtractingParams.RESOURCE_PASSWORD);
          if (resourcePassword != null) {
            epp.setExplicitPassword(resourcePassword);
            log.debug("Literal password supplied for file " + resourceName);
          }
          parser.parse(inputStream, parsingHandler, metadata, context);
        } catch (TikaException e) {
          // When ignoreTikaException is set, log the failure and continue with what was gathered
          if (ignoreTikaException)
            log.warn(
                new StringBuilder("skip extracting text due to ")
                    .append(e.getLocalizedMessage())
                    .append(". metadata=")
                    .append(metadata.toString())
                    .toString());
          else throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
        }
        if (extractOnly == false) {
          addDoc(handler);
        } else {
          // serializer is not null, so we need to call endDoc on it if using xpath
          if (xpathExpr != null) {
            serializer.endDocument();
          }
          rsp.add(stream.getName(), writer.toString());
          writer.close();
          // Return every metadata name with all of its values under "<stream name>_metadata"
          String[] names = metadata.names();
          NamedList metadataNL = new NamedList();
          for (int i = 0; i < names.length; i++) {
            String[] vals = metadata.getValues(names[i]);
            metadataNL.add(names[i], vals);
          }
          rsp.add(stream.getName() + "_metadata", metadataNL);
        }
      } catch (SAXException e) {
        throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
      } finally {
        IOUtils.closeQuietly(inputStream);
      }
    } else {
      // No parser available for this request
      throw new SolrException(
          SolrException.ErrorCode.BAD_REQUEST,
          "Stream type of "
              + streamType
              + " didn't match any known parsers.  Please supply the "
              + ExtractingParams.STREAM_TYPE
              + " parameter.");
    }
  }
예제 #9
0
  /**
   * Parses the given content with the Tika parser selected by its content type and builds a
   * parse result from the resulting DOM.
   *
   * <p>The SAX events produced by Tika are collected into a DOM fragment, from which meta
   * directives, text, title and outlinks are extracted. Tika metadata (except the title entry)
   * is copied into the parse metadata, and the HTML parse filters are run on the result.
   *
   * @param content the fetched content to parse
   * @return the filtered parse result; an empty parse result carrying a failure status when the
   *     base URL is malformed, no Tika parser is found for the mime type, or parsing throws
   */
  public ParseResult getParse(Content content) {
    String mimeType = content.getContentType();

    URL base;
    try {
      base = new URL(content.getBaseUrl());
    } catch (MalformedURLException e) {
      return new ParseStatus(e).getEmptyParseResult(content.getUrl(), getConf());
    }

    // get the right parser using the mime type as a clue
    Parser parser = tikaConfig.getParser(MediaType.parse(mimeType));
    byte[] raw = content.getContent();

    if (parser == null) {
      String message = "Can't retrieve Tika parser for mime-type " + mimeType;
      LOG.error(message);
      return new ParseStatus(ParseStatus.FAILED, message)
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    LOG.debug("Using Tika parser " + parser.getClass().getName() + " for mime-type " + mimeType);

    Metadata tikamd = new Metadata();

    // Tika's SAX events are collected into this document fragment by the DOM builder
    HTMLDocumentImpl doc = new HTMLDocumentImpl();
    doc.setErrorChecking(false);
    DocumentFragment root = doc.createDocumentFragment();
    DOMBuilder domhandler = new DOMBuilder(doc, root);
    ParseContext context = new ParseContext();
    try {
      parser.parse(new ByteArrayInputStream(raw), domhandler, tikamd, context);
    } catch (Exception e) {
      LOG.error("Error parsing " + content.getUrl(), e);
      return new ParseStatus(ParseStatus.FAILED, e.getMessage())
          .getEmptyParseResult(content.getUrl(), getConf());
    }

    // Defaults used when the meta directives forbid indexing or following links
    HTMLMetaTags metaTags = new HTMLMetaTags();
    String text = "";
    String title = "";
    Outlink[] outlinks = new Outlink[0];
    org.apache.nutch.metadata.Metadata nutchMetadata = new org.apache.nutch.metadata.Metadata();

    // we have converted the sax events generated by Tika into a DOM object
    // so we can now use the usual HTML resources from Nutch
    // get meta directives
    HTMLMetaProcessor.getMetaTags(metaTags, root, base);
    if (LOG.isTraceEnabled()) {
      LOG.trace("Meta tags for " + base + ": " + metaTags.toString());
    }

    // check meta directives
    if (!metaTags.getNoIndex()) { // okay to index
      StringBuffer sb = new StringBuffer();
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting text...");
      }
      utils.getText(sb, root); // extract text
      text = sb.toString();
      sb.setLength(0); // reset the buffer so it can be reused for the title
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting title...");
      }
      utils.getTitle(sb, root); // extract title
      title = sb.toString().trim();
    }

    if (!metaTags.getNoFollow()) { // okay to follow links
      ArrayList<Outlink> l = new ArrayList<Outlink>(); // extract outlinks
      URL baseTag = utils.getBase(root);
      if (LOG.isTraceEnabled()) {
        LOG.trace("Getting links...");
      }
      // Prefer the base URL found in the document; fall back to the content's base URL
      utils.getOutlinks(baseTag != null ? baseTag : base, l, root);
      outlinks = l.toArray(new Outlink[l.size()]);
      if (LOG.isTraceEnabled()) {
        LOG.trace("found " + outlinks.length + " outlinks in " + content.getUrl());
      }
    }

    // populate Nutch metadata with Tika metadata
    String[] TikaMDNames = tikamd.names();
    for (String tikaMDName : TikaMDNames) {
      // Skip the title entry; the title passed to ParseData is the one extracted from the DOM
      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;
      // TODO what if multivalued?
      nutchMetadata.add(tikaMDName, tikamd.get(tikaMDName));
    }

    // no outlinks? try OutlinkExtractor e.g works for mime types where no
    // explicit markup for anchors

    if (outlinks.length == 0) {
      outlinks = OutlinkExtractor.getOutlinks(text, getConf());
    }

    ParseStatus status = new ParseStatus(ParseStatus.SUCCESS);
    if (metaTags.getRefresh()) {
      // A refresh directive is reported as a redirect, with its target and delay as arguments
      status.setMinorCode(ParseStatus.SUCCESS_REDIRECT);
      status.setArgs(
          new String[] {
            metaTags.getRefreshHref().toString(), Integer.toString(metaTags.getRefreshTime())
          });
    }
    ParseData parseData =
        new ParseData(status, title, outlinks, content.getMetadata(), nutchMetadata);
    ParseResult parseResult =
        ParseResult.createParseResult(content.getUrl(), new ParseImpl(text, parseData));

    // run filters on parse
    ParseResult filteredParse = this.htmlParseFilters.filter(content, parseResult, metaTags, root);
    if (metaTags.getNoCache()) { // not okay to cache
      // Mark every parse in the filtered result with the configured caching policy
      for (Map.Entry<org.apache.hadoop.io.Text, Parse> entry : filteredParse)
        entry.getValue().getData().getParseMeta().set(Nutch.CACHING_FORBIDDEN_KEY, cachingPolicy);
    }
    return filteredParse;
  }
예제 #10
0
 /**
  * Parses a media type string and returns its base type.
  *
  * <p>The string is trimmed and lower-cased (locale-independently) before parsing.
  *
  * @param mediaTypeStr the media type string to parse
  * @return the base type of the parsed media type
  * @throws IllegalArgumentException if {@code mediaTypeStr} is {@code null} or is not a valid
  *     media type
  */
 private MediaType parseMediaType(String mediaTypeStr) {
   if (mediaTypeStr == null) {
     throw new IllegalArgumentException("Media type string must not be null");
   }
   MediaType mediaType = MediaType.parse(mediaTypeStr.trim().toLowerCase(Locale.ROOT));
   if (mediaType == null) {
     // MediaType.parse yields null for an unparseable value; fail with a clear message
     // instead of an unexplained NullPointerException on the next line
     throw new IllegalArgumentException("Invalid media type: " + mediaTypeStr);
   }
   return mediaType.getBaseType();
 }