/**
 * Returns true if {@code mediaType} falls within the given range pattern,
 * false otherwise. A {@code "*"} in the pattern's type or subtype position
 * matches any value in that position.
 */
private boolean isMediaTypeMatch(MediaType mediaType, MediaType rangePattern) {
    final String ANY = "*";
    String patternType = rangePattern.getType();
    String patternSubtype = rangePattern.getSubtype();
    boolean typeMatches =
            ANY.equals(patternType) || patternType.equals(mediaType.getType());
    boolean subtypeMatches =
            ANY.equals(patternSubtype) || patternSubtype.equals(mediaType.getSubtype());
    return typeMatches && subtypeMatches;
}
@Override public void parse( InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { // Is it a supported image? String filename = metadata.get(Metadata.RESOURCE_NAME_KEY); String type = metadata.get(Metadata.CONTENT_TYPE); boolean accept = false; if (type != null) { for (MediaType mt : types) { if (mt.toString().equals(type)) { accept = true; } } } if (filename != null) { for (MediaType mt : types) { String ext = "." + mt.getSubtype(); if (filename.endsWith(ext)) { accept = true; } } } if (!accept) return; handleImage(stream, filename, type); }
/**
 * Extracts the bare MIME type ({@code type/subtype}, parameters stripped)
 * from a content-type header value.
 *
 * @param contentType raw content-type string, possibly with parameters
 * @return {@code "type/subtype"}, or the empty string if the value cannot be parsed
 */
public static String getMimeTypeFromContentType(String contentType) {
    MediaType mediaType = MediaType.parse(contentType);
    if (mediaType == null) {
        return "";
    }
    return mediaType.getType() + "/" + mediaType.getSubtype();
}
/**
 * Entry point for the demo web-mining crawl: parses command-line options,
 * locates (or validates) the working directory, configures a fetcher policy,
 * and runs two crawl loops, carrying the crawl DB forward between loops.
 *
 * @param args command-line arguments parsed into {@link DemoWebMiningOptions}
 * @throws IOException if filesystem access fails outside the guarded try block
 */
public static void main(String[] args) throws IOException {
    DemoWebMiningOptions options = new DemoWebMiningOptions();
    CmdLineParser parser = new CmdLineParser(options);

    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        // presumably prints usage and terminates the JVM — TODO confirm;
        // otherwise execution would continue with unparsed options.
        printUsageAndExit(parser);
    }

    // Build and run the flow.
    try {
        Path workingDirPath = new Path(options.getWorkingDir());

        JobConf conf = new JobConf();
        FileSystem fs = workingDirPath.getFileSystem(conf);
        setupWorkingDir(fs, workingDirPath, CrawlConfig.SEED_URLS_FILENAME);

        Path latestDirPath = CrawlDirUtils.findLatestLoopDir(fs, workingDirPath);
        if (latestDirPath == null) {
            // NOTE(review): assumes error(...) terminates or throws — TODO confirm;
            // if it returns, the new Path(latestDirPath, ...) below would NPE.
            error("No previous cycle output dirs exist in " + workingDirPath, parser);
        }

        Path crawlDbPath = new Path(latestDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);

        UserAgent userAgent = new UserAgent(options.getAgentName(), CrawlConfig.EMAIL_ADDRESS,
                CrawlConfig.WEB_ADDRESS);

        FetcherPolicy fetcherPolicy = new FetcherPolicy();
        fetcherPolicy.setCrawlDelay(CrawlConfig.DEFAULT_CRAWL_DELAY);
        fetcherPolicy.setMaxContentSize(CrawlConfig.MAX_CONTENT_SIZE);
        fetcherPolicy.setFetcherMode(FetcherMode.EFFICIENT);

        // We only care about mime types that the Tika HTML parser can handle,
        // so restrict it to the same.
        Set<String> validMimeTypes = new HashSet<String>();
        Set<MediaType> supportedTypes = new HtmlParser().getSupportedTypes(new ParseContext());
        for (MediaType supportedType : supportedTypes) {
            // Keep only "type/subtype"; MediaType parameters are dropped.
            validMimeTypes.add(
                    String.format("%s/%s", supportedType.getType(), supportedType.getSubtype()));
        }
        fetcherPolicy.setValidMimeTypes(validMimeTypes);

        // Let's limit our crawl to two loops
        for (int curLoop = 1; curLoop <= 2; curLoop++) {
            Path curLoopDirPath = CrawlDirUtils.makeLoopDir(fs, workingDirPath, curLoop);

            // First loop (curLoop == 1) is flagged so the workflow can treat it
            // as the initial pass.
            Flow flow = DemoWebMiningWorkflow.createWebMiningWorkflow(
                    crawlDbPath, curLoopDirPath, fetcherPolicy, userAgent, options,
                    curLoop == 1);
            flow.complete();

            // Update crawlDbPath to point to the latest crawl db
            crawlDbPath = new Path(curLoopDirPath, CrawlConfig.CRAWLDB_SUBDIR_NAME);
        }
    } catch (Exception e) {
        System.err.println("Exception running job: " + e.getMessage());
        e.printStackTrace(System.err);
        System.exit(-1);
    }
}