public void inject(Path crawlDb, Path urlDir) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); if (LOG.isInfoEnabled()) { LOG.info("Injector: starting at " + sdf.format(start)); LOG.info("Injector: crawlDb: " + crawlDb); LOG.info("Injector: urlDir: " + urlDir); } Path tempDir = new Path( getConf().get("mapred.temp.dir", ".") + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // map text input file to a <url,CrawlDatum> file if (LOG.isInfoEnabled()) { LOG.info("Injector: Converting injected urls to crawl db entries."); } JobConf sortJob = new NutchJob(getConf()); sortJob.setJobName("inject " + urlDir); FileInputFormat.addInputPath(sortJob, urlDir); sortJob.setMapperClass(InjectMapper.class); FileOutputFormat.setOutputPath(sortJob, tempDir); sortJob.setOutputFormat(SequenceFileOutputFormat.class); sortJob.setOutputKeyClass(Text.class); sortJob.setOutputValueClass(CrawlDatum.class); sortJob.setLong("injector.current.time", System.currentTimeMillis()); RunningJob mapJob = JobClient.runJob(sortJob); long urlsInjected = mapJob.getCounters().findCounter("injector", "urls_injected").getValue(); long urlsFiltered = mapJob.getCounters().findCounter("injector", "urls_filtered").getValue(); LOG.info("Injector: total number of urls rejected by filters: " + urlsFiltered); LOG.info( "Injector: total number of urls injected after normalization and filtering: " + urlsInjected); // merge with existing crawl db if (LOG.isInfoEnabled()) { LOG.info("Injector: Merging injected urls into crawl db."); } JobConf mergeJob = CrawlDb.createJob(getConf(), crawlDb); FileInputFormat.addInputPath(mergeJob, tempDir); mergeJob.setReducerClass(InjectReducer.class); JobClient.runJob(mergeJob); CrawlDb.install(mergeJob, crawlDb); // clean up FileSystem fs = FileSystem.get(getConf()); fs.delete(tempDir, true); long end = System.currentTimeMillis(); LOG.info( "Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); }
// the method which actually copies the caches locally and unjars/unzips them // and does chmod for the files private static Path localizeCache( Configuration conf, URI cache, long confFileStamp, CacheStatus cacheStatus, boolean isArchive) throws IOException { FileSystem fs = getFileSystem(cache, conf); FileSystem localFs = FileSystem.getLocal(conf); Path parchive = null; if (isArchive) { parchive = new Path( cacheStatus.localizedLoadPath, new Path(cacheStatus.localizedLoadPath.getName())); } else { parchive = cacheStatus.localizedLoadPath; } if (!localFs.mkdirs(parchive.getParent())) { throw new IOException( "Mkdirs failed to create directory " + cacheStatus.localizedLoadPath.toString()); } String cacheId = cache.getPath(); fs.copyToLocalFile(new Path(cacheId), parchive); if (isArchive) { String tmpArchive = parchive.toString().toLowerCase(); File srcFile = new File(parchive.toString()); File destDir = new File(parchive.getParent().toString()); if (tmpArchive.endsWith(".jar")) { RunJar.unJar(srcFile, destDir); } else if (tmpArchive.endsWith(".zip")) { FileUtil.unZip(srcFile, destDir); } else if (isTarFile(tmpArchive)) { FileUtil.unTar(srcFile, destDir); } // else will not do anyhting // and copy the file into the dir as it is } long cacheSize = FileUtil.getDU(new File(parchive.getParent().toString())); cacheStatus.size = cacheSize; addCacheInfoUpdate(cacheStatus); // do chmod here try { // Setting recursive permission to grant everyone read and execute Path localDir = new Path(cacheStatus.localizedBaseDir, cacheStatus.uniqueParentDir); LOG.info("Doing chmod on localdir :" + localDir); FileUtil.chmod(localDir.toString(), "ugo+rx", true); } catch (InterruptedException e) { LOG.warn("Exception in chmod" + e.toString()); } // update cacheStatus to reflect the newly cached file cacheStatus.mtime = getTimestamp(conf, cache); return cacheStatus.localizedLoadPath; }
protected void setup(Context context) throws IOException, InterruptedException { super.setup(context); Path pt = new Path("/user/yao/query/query"); FileSystem fs = FileSystem.get(new Configuration()); BufferedReader br = new BufferedReader(new InputStreamReader(fs.open(pt))); String line = br.readLine(); String[] keywords = line.split(","); k0 = keywords[0]; k1 = keywords[1]; k2 = keywords[2]; br.close(); }
private int getNumSecrets(Path secretsPath) throws Exception { FileSystem fileSystem = FileSystem.get(secretsPath.toUri(), getConf()); FSDataInputStream inputStream = fileSystem.open(secretsPath); BufferedReader reader = new BufferedReader(new InputStreamReader(inputStream)); String currentLine; int numLines = 0; while ((currentLine = reader.readLine()) != null) { if (!currentLine.isEmpty()) { numLines++; } } reader.close(); return numLines; }
private void generateSaltIfNeeded(Path saltFilePath, Path secretsPath) throws Exception { FileSystem fileSystem = FileSystem.get(saltFilePath.toUri(), getConf()); if (!fileSystem.exists(saltFilePath)) { FSDataOutputStream outputStream = fileSystem.create(saltFilePath); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(outputStream)); int numSaltsToGenerate = getNumSecrets(secretsPath); System.out.printf("Generating %d salts\n", numSaltsToGenerate); for (int i = 0; i < numSaltsToGenerate; i++) { writer.write(BCrypt.gensalt()); writer.newLine(); } writer.close(); outputStream.close(); } }
private static boolean hasFile( dbutil db_util, FileSystem fs, String db_name, String file_id, String file_path) throws Exception { Get file_id_get = new Get(file_id.getBytes()); Result file_result = db_util.doGet(db_name, file_id_get); KeyValue file_names = file_result.getColumnLatest("d".getBytes(), "filenames".getBytes()); if (file_names == null) { return false; } String all_files = new String(file_names.getValue()); String[] files = all_files.split("\n"); for (String line : files) { if (line.equals(file_path)) { if (fs.globStatus(new Path(line + "*")).length == 0) { Put new_put = new Put(file_id.getBytes()); new_put.add( "d".getBytes(), "filenames".getBytes(), all_files.replace(file_path + "\n", "").getBytes()); db_util.doPut(db_name, new_put); return false; } return true; } } return false; }
/** * Clear the entire contents of the cache and delete the backing files. This should only be used * when the server is reinitializing, because the users are going to lose their files. */ public static void purgeCache(Configuration conf, MRAsyncDiskService service) throws IOException { synchronized (cachedArchives) { LocalFileSystem localFs = FileSystem.getLocal(conf); for (Map.Entry<String, CacheStatus> f : cachedArchives.entrySet()) { try { deleteLocalPath(service, localFs, f.getValue().localizedLoadPath); } catch (IOException ie) { LOG.debug("Error cleaning up cache", ie); } } cachedArchives.clear(); } }
public static void recursePath(Configuration conf, Path path, Job job) { try { FileSystem fs = path.getFileSystem(conf); FileStatus[] fstats = fs.listStatus(path); if (fstats != null) { for (FileStatus f : fstats) { Path p = f.getPath(); ; if (fs.isFile(p)) { // connection times out otherwise System.err.println("file:" + p.toString()); FileInputFormat.addInputPath(job, p); } else { System.err.println("dir:" + p.toString()); recursePath(conf, p, job); } } } } catch (IOException e) { // shouldn't be here throw new RuntimeException(e); } }
private Job configureJob(Path secretsPath, Path saltFilePath, Path inputPath, Path outputPath) throws Exception { Job job = Job.getInstance(getConf()); FileInputFormat.setInputPaths(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); job.getConfiguration().set(ObfuscateMapper.SECRET_WORDS_FILE_KEY, secretsPath.toString()); job.getConfiguration().set(ObfuscateMapper.SALT_FILE_KEY, saltFilePath.toString()); job.setInputFormatClass(TextInputFormat.class); job.setMapperClass(ObfuscateMapper.class); job.setNumReduceTasks(0); job.setJarByClass(getClass()); FileSystem.get(outputPath.toUri(), getConf()).delete(outputPath, true); return job; }
/* * Returns the relative path of the dir this cache will be localized in * relative path that this cache will be localized in. For * hdfs://hostname:port/absolute_path -- the relative path is * hostname/absolute path -- if it is just /absolute_path -- then the * relative path is hostname of DFS this mapred cluster is running * on/absolute_path */ public static String makeRelative(URI cache, Configuration conf) throws IOException { String host = cache.getHost(); if (host == null) { host = cache.getScheme(); } if (host == null) { URI defaultUri = FileSystem.get(conf).getUri(); host = defaultUri.getHost(); if (host == null) { host = defaultUri.getScheme(); } } String path = host + cache.getPath(); path = path.replace(":/", "/"); // remove windows device colon return path; }
private static Path checkCacheStatusValidity( Configuration conf, URI cache, long confFileStamp, CacheStatus cacheStatus, FileStatus fileStatus, boolean isArchive) throws IOException { FileSystem fs = FileSystem.get(cache, conf); // Has to be if (!ifExistsAndFresh(conf, fs, cache, confFileStamp, cacheStatus, fileStatus)) { throw new IOException( "Stale cache file: " + cacheStatus.localizedLoadPath + " for cache-file: " + cache); } LOG.info( String.format( "Using existing cache of %s->%s", cache.toString(), cacheStatus.localizedLoadPath)); return cacheStatus.localizedLoadPath; }
private static void deleteCache(Configuration conf, MRAsyncDiskService asyncDiskService) throws IOException { List<CacheStatus> deleteSet = new LinkedList<CacheStatus>(); // try deleting cache Status with refcount of zero synchronized (cachedArchives) { for (Iterator<String> it = cachedArchives.keySet().iterator(); it.hasNext(); ) { String cacheId = (String) it.next(); CacheStatus lcacheStatus = cachedArchives.get(cacheId); if (lcacheStatus.refcount == 0) { // delete this cache entry from the global list // and mark the localized file for deletion deleteSet.add(lcacheStatus); it.remove(); } } } // do the deletion asynchronously, after releasing the global lock Thread cacheFileCleaner = new Thread(new CacheFileCleanTask(asyncDiskService, FileSystem.getLocal(conf), deleteSet)); cacheFileCleaner.start(); }
private static FileSystem getFileSystem(URI cache, Configuration conf) throws IOException { String fileSysName = getFileSysName(cache); if (fileSysName != null) return FileSystem.getNamed(fileSysName, conf); else return FileSystem.get(conf); }
/** * Returns the status of a given cache file on hdfs. * * @param conf configuration * @param cache cache file * @return FileStatus object of the file * @throws IOException */ public static FileStatus getFileStatus(Configuration conf, URI cache) throws IOException { FileSystem fileSystem = FileSystem.get(cache, conf); Path filePath = new Path(cache.getPath()); return fileSystem.getFileStatus(filePath); }
/** * Returns mtime of a given cache file on hdfs. * * @param conf configuration * @param cache cache file * @return mtime of a given cache file on hdfs * @throws IOException */ public static long getTimestamp(Configuration conf, URI cache) throws IOException { FileSystem fileSystem = FileSystem.get(cache, conf); Path filePath = new Path(cache.getPath()); return fileSystem.getFileStatus(filePath).getModificationTime(); }
// Information needed to get a single file: // BASE_PATH, FILE_ID, TIMESTAMP_START, TIMESTAMP_STOP, SOURCE, FILESYSTEM private static Vector<Path> getFile( FileSystem fs, Hashtable<String, String> config, dbutil db_util) throws Exception { Long latestVersion = latestVersion(config, db_util); try { config.put("timestamp_start", config.get("timestamp_start")); config.put("timestamp_real", latestVersion.toString()); config.put("timestamp_stop", latestVersion.toString()); } catch (Exception E) { logger.error("Tryign to get file that is impossible to generate: " + getFullPath(config)); return null; } if (Integer.parseInt(config.get("timestamp_start")) > Integer.parseInt(config.get("timestamp_stop"))) { return null; } logger.debug( "Getting DB for timestamp " + config.get("timestamp_start") + " to " + config.get("timestamp_stop")); String final_result = getFullPath(config); String temp_path_base = config.get("local_temp_path") + "_" + config.get("task_id") + "_" + config.get("run_id") + "/"; Path newPath = new Path(final_result + "*"); Vector<Path> ret_path = new Vector<Path>(); String lockName = lock(final_result.replaceAll("/", "_")); if (fs.globStatus(newPath).length != 0) { ret_path.add(newPath); unlock(lockName); config.put("full_file_name", final_result); return ret_path; } else { if (!config.get("source").equals("local")) { config.put("temp_path_base", temp_path_base); config.put("timestamp_start", config.get("timestamp_start")); config.put("timestamp_real", latestVersion.toString()); config.put("timestamp_stop", latestVersion.toString()); Class<?> sourceClass = Class.forName("org.gestore.plugin.source." + config.get("source") + "Source"); Method process_data = sourceClass.getMethod("process", Hashtable.class, FileSystem.class); Object processor = sourceClass.newInstance(); Object retVal; try { retVal = process_data.invoke(processor, config, fs); } catch (InvocationTargetException E) { Throwable exception = E.getTargetException(); logger.error("Unable to call method in child class: " + exception.toString()); exception.printStackTrace(System.out); unlock(lockName); return null; } FileStatus[] files = (FileStatus[]) retVal; if (files == null) { logger.error("Error getting files, no files returned"); return null; } for (FileStatus file : files) { Path cur_file = file.getPath(); Path cur_local_path = new Path(temp_path_base + config.get("file_id")); String suffix = getSuffix(config.get("file_id"), cur_file.getName()); cur_local_path = cur_local_path.suffix(suffix); Path res_path = new Path(new String(final_result + suffix)); logger.debug("Moving file" + cur_file.toString() + " to " + res_path.toString()); if (config.get("copy").equals("true")) { fs.moveFromLocalFile(cur_file, res_path); } else { fs.rename(cur_file, res_path); } } config.put("full_file_name", final_result); } } unlock(lockName); return ret_path; }
public int run(String[] args) throws Exception { // printUsage(); /* * SETUP */ Configuration argConf = getConf(); Hashtable<String, String> confArg = new Hashtable<String, String>(); setup(confArg, argConf); Date currentTime = new Date(); Date endDate = new Date(new Long(confArg.get("timestamp_stop"))); Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*"); Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*"); logger.info("Running GeStore"); // ZooKeeper setup Configuration config = HBaseConfiguration.create(); zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config)); zkInstance = new ZooKeeper( ZKConfig.getZKQuorumServersString(config), config.getInt("zookeeper.session.timeout", -1), zkWatcher); if (!confArg.get("task_id").isEmpty()) { confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id")); } String lockRequest = confArg.get("file_id"); if (!confArg.get("run_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("run_id") + "_"; if (!confArg.get("task_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("task_id") + "_"; // Get type of movement toFrom type_move = checkArgs(confArg); if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) { List<String> arguments = new ArrayList<String>(); arguments.add("-Dinput=" + confArg.get("local_path")); arguments.add("-Dtable=" + confArg.get("file_id")); arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop")); arguments.add("-Dtype=" + confArg.get("format")); arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id")); arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path")); arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id")); if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add")); String lockName = lock(lockRequest); String[] argumentString = arguments.toArray(new String[arguments.size()]); adddb.main(argumentString); unlock(lockName); System.exit(0); } // Database registration dbutil db_util = new dbutil(config); db_util.register_database(confArg.get("db_name_files"), true); db_util.register_database(confArg.get("db_name_runs"), true); db_util.register_database(confArg.get("db_name_updates"), true); FileSystem hdfs = FileSystem.get(config); FileSystem localFS = FileSystem.getLocal(config); // Get source type confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); confArg.put( "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); if (!confArg.get("source").equals("local") && type_move == toFrom.REMOTE2LOCAL && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) { confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util))); } /* * Get previous timestamp */ Get run_id_get = new Get(confArg.get("run_id").getBytes()); Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get); KeyValue run_file_prev = run_get.getColumnLatest( "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes()); String last_timestamp = new String("0"); if (null != run_file_prev && !confArg.get("source").equals("local")) { long last_timestamp_real = run_file_prev.getTimestamp(); Long current_timestamp = new Long(confArg.get("timestamp_real")); if ((current_timestamp - last_timestamp_real) > 36000) { last_timestamp = new String(run_file_prev.getValue()); Integer lastTimestamp = new Integer(last_timestamp); lastTimestamp += 1; last_timestamp = lastTimestamp.toString(); logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate); Date last_run = new Date(run_file_prev.getTimestamp()); if (last_run.before(endDate) && !full_run) { confArg.put("timestamp_start", last_timestamp); } } } Integer tse = new Integer(confArg.get("timestamp_stop")); Integer tss = new Integer(confArg.get("timestamp_start")); if (tss > tse) { logger.info("No new version of requested file."); return 0; } /* * Generate file */ String lockName = lock(lockRequest); Get file_id_get = new Get(confArg.get("file_id").getBytes()); Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get); if (!file_get.isEmpty()) { boolean found = hasFile( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); if (confArg.get("source").equals("fullfile")) { found = false; } String filenames_put = getFileNames( db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); // Filename not found in file database if (!found && type_move == toFrom.REMOTE2LOCAL) { if (!confArg.get("source").equals("local")) { // Generate intermediate file if (getFile(hdfs, confArg, db_util) == null) { unlock(lockName); return 1; } // Put generated file into file database if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), confArg.get("full_file_name"), confArg.get("source")); } } else { logger.warn("Remote file not found, and cannot be generated! File: " + confArg); unlock(lockName); return 1; } } } else { if (type_move == toFrom.REMOTE2LOCAL) { logger.warn("Remote file not found, and cannot be generated."); unlock(lockName); return 1; } } /* * Copy file * Update tables */ if (type_move == toFrom.LOCAL2REMOTE) { if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg), confArg.get("source")); } putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg))); } else if (type_move == toFrom.REMOTE2LOCAL) { FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*")); putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); unlock(lockName); for (FileStatus file : files) { Path cur_file = file.getPath(); Path cur_local_path = new Path(new String(confArg.get("local_path") + confArg.get("file_id"))); String suffix = getSuffix(getFileName(confArg), cur_file.getName()); if (suffix.length() > 0) { cur_local_path = cur_local_path.suffix(new String("." + suffix)); } if (confArg.get("copy").equals("true")) { String crc = hdfs.getFileChecksum(cur_file).toString(); if (checksumLocalTest(cur_local_path, crc)) { continue; } else { hdfs.copyToLocalFile(cur_file, cur_local_path); writeChecksum(cur_local_path, crc); } } else { System.out.println(cur_local_path + "\t" + cur_file); } } } unlock(lockName); return 0; }