コード例 #1
0
 static {
   // Ask a Tika PackageParser which media types it declares (no parse context
   // is supplied) and record the string form of each one.
   final ArrayList<String> mimeTypeNames = new ArrayList<String>();
   final Parser packageParser = new PackageParser();
   for (MediaType supported : packageParser.getSupportedTypes(null)) {
     mimeTypeNames.add(supported.toString());
   }
   SUPPORTED_MIMETYPES = mimeTypeNames;
 }
コード例 #2
0
ファイル: TikaCLI.java プロジェクト: slavianp/webscrap
  /**
   * Prints one line naming the given parser and then, depending on its kind, either its
   * supported media types or its sub-parsers (recursively).
   *
   * <p>For a {@link ParserDecorator} the printed name is the class name of the wrapped parser;
   * otherwise it is the class name of {@code p} itself.
   *
   * @param p the parser to describe
   * @param includeMimeTypes whether to list the supported media types of non-composite parsers
   * @param i the indentation level passed to {@code indent}; nested output uses {@code i + 2}
   */
  private void displayParser(Parser p, boolean includeMimeTypes, int i) {
    final String className;
    if (p instanceof ParserDecorator) {
      className = ((ParserDecorator) p).getWrappedParser().getClass().getName();
    } else {
      className = p.getClass().getName();
    }

    // Leaf parser: print its name and, on request, each media type it supports.
    if (!(p instanceof CompositeParser)) {
      System.out.println(indent(i) + className);
      if (includeMimeTypes) {
        for (MediaType type : p.getSupportedTypes(context)) {
          System.out.println(indent(i + 2) + type);
        }
      }
      return;
    }

    // Composite parser: print a marked header, then describe every sub-parser one level deeper.
    System.out.println(indent(i) + className + " (Composite Parser):");
    CompositeParser composite = (CompositeParser) p;
    Parser[] children = sortParsers(invertMediaTypeMap(composite.getParsers()));
    for (Parser child : children) {
      displayParser(child, includeMimeTypes, i + 2);
    }
  }
コード例 #3
0
    /**
     * Builds the command from its morphline configuration.
     *
     * <p>The constructor, in order:
     *
     * <ol>
     *   <li>resolves the Solr index schema via the {@code solrLocator} config section;
     *   <li>collects the optional {@link ExtractingParams} settings (unknown-field prefix,
     *       capture elements, {@code fmap} field mappings, capture attributes, lower-names,
     *       default field, XPath expression) into a multimap;
     *   <li>reads {@code dateFormats}, {@code solrContentHandlerFactory} and {@code locale};
     *   <li>instantiates every Tika parser listed under {@code parsers} and registers the media
     *       types each one is responsible for;
     *   <li>converts the collected settings into {@code solrParams} and calls
     *       {@code validateArguments()}.
     * </ol>
     *
     * @param builder the command builder, forwarded to the superclass constructor
     * @param config the configuration of this command
     * @param parent the parent command, forwarded to the superclass constructor
     * @param child the child command, forwarded to the superclass constructor
     * @param context the morphline context, also used to construct the {@link SolrLocator}
     * @throws MorphlineCompilationException if the content handler factory class cannot be found,
     *     or a configured parser cannot be instantiated or is not a Tika {@link Parser}
     */
    public SolrCell(
        CommandBuilder builder,
        Config config,
        Command parent,
        Command child,
        MorphlineContext context) {
      super(builder, config, parent, child, context);

      Config solrLocatorConfig = getConfigs().getConfig(config, "solrLocator");
      SolrLocator locator = new SolrLocator(solrLocatorConfig, context);
      LOG.debug("solrLocator: {}", locator);
      this.schema = locator.getIndexSchema();
      Preconditions.checkNotNull(schema);
      // Sorting and joining every schema field is only worth doing when the result is logged.
      if (LOG.isTraceEnabled()) {
        LOG.trace(
            "Solr schema: \n{}", Joiner.on("\n").join(new TreeMap<>(schema.getFields()).values()));
      }

      // Optional extraction settings; a key is only added when the config provides a value.
      ListMultimap<String, String> cellParams = ArrayListMultimap.create();
      String uprefix = getConfigs().getString(config, ExtractingParams.UNKNOWN_FIELD_PREFIX, null);
      if (uprefix != null) {
        cellParams.put(ExtractingParams.UNKNOWN_FIELD_PREFIX, uprefix);
      }
      for (String capture :
          getConfigs()
              .getStringList(
                  config, ExtractingParams.CAPTURE_ELEMENTS, Collections.<String>emptyList())) {
        cellParams.put(ExtractingParams.CAPTURE_ELEMENTS, capture);
      }
      Config fmapConfig = getConfigs().getConfig(config, "fmap", null);
      if (fmapConfig != null) {
        // Each fmap entry becomes a parameter named MAP_PREFIX + <entry key>.
        for (Map.Entry<String, Object> entry : new Configs().getEntrySet(fmapConfig)) {
          cellParams.put(ExtractingParams.MAP_PREFIX + entry.getKey(), entry.getValue().toString());
        }
      }
      String captureAttributes =
          getConfigs().getString(config, ExtractingParams.CAPTURE_ATTRIBUTES, null);
      if (captureAttributes != null) {
        cellParams.put(ExtractingParams.CAPTURE_ATTRIBUTES, captureAttributes);
      }
      String lowerNames = getConfigs().getString(config, ExtractingParams.LOWERNAMES, null);
      if (lowerNames != null) {
        cellParams.put(ExtractingParams.LOWERNAMES, lowerNames);
      }
      String defaultField = getConfigs().getString(config, ExtractingParams.DEFAULT_FIELD, null);
      if (defaultField != null) {
        cellParams.put(ExtractingParams.DEFAULT_FIELD, defaultField);
      }
      xpathExpr = getConfigs().getString(config, ExtractingParams.XPATH_EXPRESSION, null);
      if (xpathExpr != null) {
        cellParams.put(ExtractingParams.XPATH_EXPRESSION, xpathExpr);
      }

      // Falls back to a copy of DateUtil.DEFAULT_DATE_FORMATS when "dateFormats" is not configured.
      this.dateFormats =
          getConfigs()
              .getStringList(config, "dateFormats", new ArrayList<>(DateUtil.DEFAULT_DATE_FORMATS));

      // The content handler factory is loaded by class name; TrimSolrContentHandlerFactory is the
      // default.
      String handlerStr =
          getConfigs()
              .getString(
                  config,
                  "solrContentHandlerFactory",
                  TrimSolrContentHandlerFactory.class.getName());
      Class<? extends SolrContentHandlerFactory> factoryClass;
      try {
        // NOTE(review): unchecked cast - the loaded class is not verified here to actually be a
        // SolrContentHandlerFactory subtype.
        factoryClass = (Class<? extends SolrContentHandlerFactory>) Class.forName(handlerStr);
      } catch (ClassNotFoundException cnfe) {
        throw new MorphlineCompilationException(
            "Could not find class " + handlerStr + " to use for " + "solrContentHandlerFactory",
            config,
            cnfe);
      }
      this.solrContentHandlerFactory =
          getSolrContentHandlerFactory(factoryClass, dateFormats, config);

      this.locale = getLocale(getConfigs().getString(config, "locale", ""));

      this.mediaTypeToParserMap = new HashMap<>();
      // MimeTypes mimeTypes = MimeTypes.getDefaultMimeTypes(); // FIXME
      // getMediaTypeRegistry.normalize()

      List<? extends Config> parserConfigs = getConfigs().getConfigList(config, "parsers");
      for (Config parserConfig : parserConfigs) {
        String parserClassName = getConfigs().getString(parserConfig, "parser");

        // Parsers are created reflectively via their no-arg constructor; any failure is reported
        // as a compilation error.
        Object obj;
        try {
          obj = Class.forName(parserClassName).newInstance();
        } catch (Throwable e) {
          throw new MorphlineCompilationException(
              "Cannot instantiate Tika parser: " + parserClassName, config, e);
        }
        if (!(obj instanceof Parser)) {
          throw new MorphlineCompilationException(
              "Tika parser "
                  + obj.getClass().getName()
                  + " must be an instance of class "
                  + Parser.class.getName(),
              config);
        }
        Parser parser = (Parser) obj;
        this.parsers.add(parser);

        // Media types listed explicitly under SUPPORTED_MIME_TYPES are mapped to this parser.
        List<String> mediaTypes =
            getConfigs()
                .getStringList(parserConfig, SUPPORTED_MIME_TYPES, Collections.<String>emptyList());
        for (String mediaTypeStr : mediaTypes) {
          MediaType mediaType = parseMediaType(mediaTypeStr);
          addSupportedMimeType(mediaTypeStr);
          this.mediaTypeToParserMap.put(mediaType, parser);
        }

        // Without an explicit list, use the base types the parser itself reports, plus any
        // ADDITIONAL_SUPPORTED_MIME_TYPES from the config.
        if (!parserConfig.hasPath(SUPPORTED_MIME_TYPES)) {
          for (MediaType mediaType : parser.getSupportedTypes(new ParseContext())) {
            mediaType = mediaType.getBaseType();
            addSupportedMimeType(mediaType.toString());
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
          List<String> extras =
              getConfigs()
                  .getStringList(
                      parserConfig,
                      ADDITIONAL_SUPPORTED_MIME_TYPES,
                      Collections.<String>emptyList());
          for (String mediaTypeStr : extras) {
            MediaType mediaType = parseMediaType(mediaTypeStr);
            addSupportedMimeType(mediaTypeStr);
            this.mediaTypeToParserMap.put(mediaType, parser);
          }
        }
      }
      // LOG.info("mediaTypeToParserMap="+mediaTypeToParserMap);

      // MultiMapSolrParams takes String[] values, so flatten each multimap value collection.
      Map<String, String[]> tmp = new HashMap<>();
      for (Map.Entry<String, Collection<String>> entry : cellParams.asMap().entrySet()) {
        tmp.put(entry.getKey(), entry.getValue().toArray(new String[entry.getValue().size()]));
      }
      this.solrParams = new MultiMapSolrParams(tmp);
      validateArguments();
    }