Пример #1
0
 /**
  * Checks whether a concrete media type is covered by a media range pattern.
  *
  * <p>The type and the subtype are compared independently. A part of the pattern matches when
  * it is the wildcard {@code "*"} or when it equals (via {@link String#equals}) the
  * corresponding part of {@code mediaType}.
  *
  * @param mediaType the media type being tested
  * @param rangePattern the range to test against; its type and/or subtype may be {@code "*"}
  * @return {@code true} if both the type and the subtype match, {@code false} otherwise
  */
 private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
   final String wildcard = "*";

   // Reject early when the top-level type is neither a wildcard nor an exact match.
   String patternType = rangePattern.getType();
   if (!patternType.equals(wildcard) && !patternType.equals(mediaType.getType())) {
     return false;
   }

   // The type matched, so the outcome now depends on the subtype alone.
   String patternSubtype = rangePattern.getSubtype();
   return patternSubtype.equals(wildcard) || patternSubtype.equals(mediaType.getSubtype());
 }
    /**
     * Passes the stream on to {@code handleImage} when the metadata identifies a supported image.
     *
     * <p>The input is accepted when either of the following holds for at least one entry of
     * {@code types}:
     *
     * <ul>
     *   <li>the {@link Metadata#CONTENT_TYPE} value equals the entry's {@code toString()} form;
     *   <li>the {@link Metadata#RESOURCE_NAME_KEY} value ends with {@code "."} followed by the
     *       entry's subtype (a case-sensitive suffix check).
     * </ul>
     *
     * <p>If neither condition is met the method returns without doing anything. The
     * {@code handler} and {@code context} arguments are not used by this method body.
     *
     * @param stream the content to hand to {@code handleImage}
     * @param handler not used here
     * @param metadata source of the resource name and content type; either may be absent
     * @param context not used here
     * @throws IOException declared by the overridden method
     * @throws SAXException declared by the overridden method
     * @throws TikaException declared by the overridden method
     */
    @Override
    public void parse(
        InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY);
      String contentType = metadata.get(Metadata.CONTENT_TYPE);

      boolean supported = false;
      // With neither a content type nor a file name there is nothing to compare against.
      if (contentType != null || resourceName != null) {
        for (MediaType candidate : types) {
          boolean typeMatches = contentType != null && candidate.toString().equals(contentType);
          boolean extensionMatches =
              resourceName != null && resourceName.endsWith("." + candidate.getSubtype());
          if (typeMatches || extensionMatches) {
            supported = true;
            break;
          }
        }
      }

      if (!supported) {
        return;
      }

      handleImage(stream, resourceName, contentType);
    }
Пример #3
0
  /**
   * Reduces a content-type string to its bare {@code type/subtype} form.
   *
   * <p>The value is parsed with {@code MediaType.parse}; only the type and subtype of the
   * result are used, so anything else carried by the parsed value is left out.
   *
   * @param contentType the content-type string to parse
   * @return {@code type + "/" + subtype} of the parsed value, or an empty string when
   *     {@code MediaType.parse} returns {@code null}
   */
  public static String getMimeTypeFromContentType(String contentType) {
    MediaType parsed = MediaType.parse(contentType);
    if (parsed == null) {
      return "";
    }
    return parsed.getType() + "/" + parsed.getSubtype();
  }
Пример #4
0
  /**
   * Command-line entry point for the demo web-mining crawl.
   *
   * <p>Steps, in order:
   *
   * <ol>
   *   <li>Parse {@code args} into a {@code DemoWebMiningOptions}; on a parse error the message
   *       is printed to stderr and {@code printUsageAndExit} is called.
   *   <li>Prepare the working directory and locate the latest loop directory, whose crawl DB
   *       sub-directory seeds the first loop.
   *   <li>Build the user agent and a fetcher policy restricted to the MIME types reported by
   *       {@code HtmlParser.getSupportedTypes}.
   *   <li>Run two workflow loops, each one reading the crawl DB written by the previous loop.
   * </ol>
   *
   * <p>Any exception raised while building or running the flow is reported on stderr (message
   * and stack trace) and the JVM exits with status {@code -1}.
   *
   * @param args command-line arguments
   * @throws IOException declared by the signature; exceptions from the workflow section are
   *     caught and reported inside this method
   */
  public static void main(String[] args) throws IOException {
    DemoWebMiningOptions opts = new DemoWebMiningOptions();
    CmdLineParser cmdLine = new CmdLineParser(opts);

    try {
      cmdLine.parseArgument(args);
    } catch (CmdLineException badArgs) {
      System.err.println(badArgs.getMessage());
      printUsageAndExit(cmdLine);
    }

    // Build and run the flow.
    try {
      Path workDir = new Path(opts.getWorkingDir());
      JobConf jobConf = new JobConf();
      FileSystem fileSystem = workDir.getFileSystem(jobConf);
      setupWorkingDir(fileSystem, workDir, CrawlConfig.SEED_URLS_FILENAME);

      Path latestLoopDir = CrawlDirUtils.findLatestLoopDir(fileSystem, workDir);
      if (latestLoopDir == null) {
        // NOTE(review): presumably error(...) terminates the JVM; if it returns normally,
        // latestLoopDir is still null when it is used just below - confirm.
        error("No previous cycle output dirs exist in " + workDir, cmdLine);
      }
      Path crawlDb = new Path(latestLoopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);

      UserAgent agent =
          new UserAgent(opts.getAgentName(), CrawlConfig.EMAIL_ADDRESS, CrawlConfig.WEB_ADDRESS);

      FetcherPolicy policy = new FetcherPolicy();
      policy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
      policy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
      policy.setFetcherMode(FetcherMode.EFFICIENT);

      // Only fetch content the Tika HTML parser can handle: mirror its supported types as
      // "type/subtype" strings.
      Set<String> htmlMimeTypes = new HashSet<String>();
      for (MediaType htmlType : new HtmlParser().getSupportedTypes(new ParseContext())) {
        htmlMimeTypes.add(String.format("%s/%s", htmlType.getType(), htmlType.getSubtype()));
      }
      policy.setValidMimeTypes(htmlMimeTypes);

      // The demo crawl is limited to two loops.
      final int loopCount = 2;
      for (int loop = 1; loop <= loopCount; loop++) {
        Path loopDir = CrawlDirUtils.makeLoopDir(fileSystem, workDir, loop);
        boolean firstLoop = (loop == 1);
        Flow flow =
            DemoWebMiningWorkflow.createWebMiningWorkflow(
                crawlDb, loopDir, policy, agent, opts, firstLoop);
        flow.complete();

        // The next loop reads the crawl DB that this loop just wrote.
        crawlDb = new Path(loopDir, CrawlConfig.CRAWLDB_SUBDIR_NAME);
      }
    } catch (Exception failure) {
      System.err.println("Exception running job: " + failure.getMessage());
      failure.printStackTrace(System.err);
      System.exit(-1);
    }
  }