/** * Generate the list of files and make them into FileSplits. This needs to be copied to insert a * filter on acceptable data */ @Override public List<InputSplit> getSplits(JobContext job) throws IOException { long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job)); long maxSize = getMaxSplitSize(job); long desiredMappers = job.getConfiguration().getLong("org.systemsbiology.jxtandem.DesiredXMLInputMappers", 0); // generate splits List<InputSplit> splits = new ArrayList<InputSplit>(); List<FileStatus> fileStatuses = listStatus(job); boolean forceNumberMappers = fileStatuses.size() == 1; for (FileStatus file : fileStatuses) { Path path = file.getPath(); if (!isPathAcceptable(path)) // filter acceptable data continue; FileSystem fs = path.getFileSystem(job.getConfiguration()); long length = file.getLen(); BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length); if ((length != 0) && isSplitable(job, path)) { long blockSize = file.getBlockSize(); // use desired mappers to force more splits if (forceNumberMappers && desiredMappers > 0) maxSize = Math.min(maxSize, (length / desiredMappers)); long splitSize = computeSplitSize(blockSize, minSize, maxSize); long bytesRemaining = length; while (withinSlop(splitSize, bytesRemaining)) { int blkIndex = getBlockIndex(blkLocations, length - bytesRemaining); splits.add( new FileSplit( path, length - bytesRemaining, splitSize, blkLocations[blkIndex].getHosts())); bytesRemaining -= splitSize; } if (bytesRemaining != 0) { splits.add( new FileSplit( path, length - bytesRemaining, bytesRemaining, blkLocations[blkLocations.length - 1].getHosts())); } } else if (length != 0) { splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts())); } else { // Create empty hosts array for zero length files splits.add(new FileSplit(path, 0, length, new String[0])); } } System.out.println("Total # of splits: " + splits.size()); // LOG.debug("Total # of splits: " + splits.size()); return splits; }
public static List<Double> GetPoint(String s) { StringTokenizer tokenizer = new StringTokenizer(s); List<Double> p = new ArrayList<Double>(58); while (tokenizer.hasMoreTokens()) { p.add(Double.parseDouble(tokenizer.nextToken())); } return p; }
public static List<String> tokenizeString(Analyzer analyzer, String string) { List<String> result = new ArrayList<String>(); try { TokenStream stream = analyzer.tokenStream(null, new StringReader(string)); stream.reset(); while (stream.incrementToken()) { result.add(stream.getAttribute(CharTermAttribute.class).toString()); } } catch (IOException e) { // not thrown b/c we're using a string reader... throw new RuntimeException(e); } return result; }
public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { List<Double> newCentroid = null; int numPoints = 0; for (Text value : values) { ++numPoints; List<Double> p = GetPoint(value.toString()); if (newCentroid == null) { // initialize the new centroid to the first element newCentroid = new ArrayList<Double>(p); } else { for (int i = 0; i < newCentroid.size(); i++) { newCentroid.set(i, newCentroid.get(i) + p.get(i)); } } } // now the newCentroid contains the sum of all the points // so to get the average of all the points, we need to // divide each entry by the total number of points for (int i = 0; i < newCentroid.size(); i++) { newCentroid.set(i, newCentroid.get(i) / (double) numPoints); } // now create a string containing all the new centroid's coordinates String s = null; for (Double d : newCentroid) { if (s == null) { s = d.toString(); } else { s += " " + d.toString(); } } newCentroids.add(s); if (newCentroids.size() == 10) { WriteNewCentroids(context); } // output the centroid ID and the centroid data context.write(key, new Text(s)); }
protected void setup(Context context) throws IOException, InterruptedException { FileSystem fs = FileSystem.get(context.getConfiguration()); Path cFile = new Path(context.getConfiguration().get("CFILE")); DataInputStream d = new DataInputStream(fs.open(cFile)); BufferedReader reader = new BufferedReader(new InputStreamReader(d)); String line; while ((line = reader.readLine()) != null) { StringTokenizer tokenizer = new StringTokenizer(line.toString()); if (tokenizer.hasMoreTokens()) { List<Double> centroid = new ArrayList<Double>(58); while (tokenizer.hasMoreTokens()) { centroid.add(Double.parseDouble(tokenizer.nextToken())); } centroids.add(centroid); } } k = centroids.size(); }
public int run(String[] args) throws Exception { // printUsage(); /* * SETUP */ Configuration argConf = getConf(); Hashtable<String, String> confArg = new Hashtable<String, String>(); setup(confArg, argConf); Date currentTime = new Date(); Date endDate = new Date(new Long(confArg.get("timestamp_stop"))); Boolean full_run = confArg.get("intermediate").matches("(?i).*true.*"); Boolean quick_add = confArg.get("quick_add").matches("(?i).*true.*"); logger.info("Running GeStore"); // ZooKeeper setup Configuration config = HBaseConfiguration.create(); zkWatcher = new ZooKeeperWatcher(config, "Testing", new HBaseAdmin(config)); zkInstance = new ZooKeeper( ZKConfig.getZKQuorumServersString(config), config.getInt("zookeeper.session.timeout", -1), zkWatcher); if (!confArg.get("task_id").isEmpty()) { confArg.put("temp_path", confArg.get("temp_path") + confArg.get("task_id")); } String lockRequest = confArg.get("file_id"); if (!confArg.get("run_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("run_id") + "_"; if (!confArg.get("task_id").isEmpty()) lockRequest = lockRequest + "_" + confArg.get("task_id") + "_"; // Get type of movement toFrom type_move = checkArgs(confArg); if (type_move == toFrom.LOCAL2REMOTE && !confArg.get("format").equals("unknown")) { List<String> arguments = new ArrayList<String>(); arguments.add("-Dinput=" + confArg.get("local_path")); arguments.add("-Dtable=" + confArg.get("file_id")); arguments.add("-Dtimestamp=" + confArg.get("timestamp_stop")); arguments.add("-Dtype=" + confArg.get("format")); arguments.add("-Dtarget_dir=" + confArg.get("base_path") + "_" + confArg.get("file_id")); arguments.add("-Dtemp_hdfs_path=" + confArg.get("temp_path")); arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("run_id").isEmpty()) arguments.add("-Drun_id=" + confArg.get("run_id")); if (!confArg.get("task_id").isEmpty()) arguments.add("-Dtask_id=" + confArg.get("task_id")); if (quick_add) arguments.add("-Dquick_add=" + confArg.get("quick_add")); String lockName = lock(lockRequest); String[] argumentString = arguments.toArray(new String[arguments.size()]); adddb.main(argumentString); unlock(lockName); System.exit(0); } // Database registration dbutil db_util = new dbutil(config); db_util.register_database(confArg.get("db_name_files"), true); db_util.register_database(confArg.get("db_name_runs"), true); db_util.register_database(confArg.get("db_name_updates"), true); FileSystem hdfs = FileSystem.get(config); FileSystem localFS = FileSystem.getLocal(config); // Get source type confArg.put("source", getSource(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); confArg.put( "database", isDatabase(db_util, confArg.get("db_name_files"), confArg.get("file_id"))); if (!confArg.get("source").equals("local") && type_move == toFrom.REMOTE2LOCAL && !confArg.get("timestamp_stop").equals(Integer.toString(Integer.MAX_VALUE))) { confArg.put("timestamp_stop", Long.toString(latestVersion(confArg, db_util))); } /* * Get previous timestamp */ Get run_id_get = new Get(confArg.get("run_id").getBytes()); Result run_get = db_util.doGet(confArg.get("db_name_runs"), run_id_get); KeyValue run_file_prev = run_get.getColumnLatest( "d".getBytes(), (confArg.get("file_id") + "_db_timestamp").getBytes()); String last_timestamp = new String("0"); if (null != run_file_prev && !confArg.get("source").equals("local")) { long last_timestamp_real = run_file_prev.getTimestamp(); Long current_timestamp = new Long(confArg.get("timestamp_real")); if ((current_timestamp - last_timestamp_real) > 36000) { last_timestamp = new String(run_file_prev.getValue()); Integer lastTimestamp = new Integer(last_timestamp); lastTimestamp += 1; last_timestamp = lastTimestamp.toString(); logger.info("Last timestamp: " + last_timestamp + " End data: " + endDate); Date last_run = new Date(run_file_prev.getTimestamp()); if (last_run.before(endDate) && !full_run) { confArg.put("timestamp_start", last_timestamp); } } } Integer tse = new Integer(confArg.get("timestamp_stop")); Integer tss = new Integer(confArg.get("timestamp_start")); if (tss > tse) { logger.info("No new version of requested file."); return 0; } /* * Generate file */ String lockName = lock(lockRequest); Get file_id_get = new Get(confArg.get("file_id").getBytes()); Result file_get = db_util.doGet(confArg.get("db_name_files"), file_id_get); if (!file_get.isEmpty()) { boolean found = hasFile( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); if (confArg.get("source").equals("fullfile")) { found = false; } String filenames_put = getFileNames( db_util, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg)); // Filename not found in file database if (!found && type_move == toFrom.REMOTE2LOCAL) { if (!confArg.get("source").equals("local")) { // Generate intermediate file if (getFile(hdfs, confArg, db_util) == null) { unlock(lockName); return 1; } // Put generated file into file database if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), confArg.get("full_file_name"), confArg.get("source")); } } else { logger.warn("Remote file not found, and cannot be generated! File: " + confArg); unlock(lockName); return 1; } } } else { if (type_move == toFrom.REMOTE2LOCAL) { logger.warn("Remote file not found, and cannot be generated."); unlock(lockName); return 1; } } /* * Copy file * Update tables */ if (type_move == toFrom.LOCAL2REMOTE) { if (!confArg.get("format").equals("fullfile")) { putFileEntry( db_util, hdfs, confArg.get("db_name_files"), confArg.get("file_id"), getFullPath(confArg), confArg.get("source")); } putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); hdfs.copyFromLocalFile(new Path(confArg.get("local_path")), new Path(getFullPath(confArg))); } else if (type_move == toFrom.REMOTE2LOCAL) { FileStatus[] files = hdfs.globStatus(new Path(getFullPath(confArg) + "*")); putRunEntry( db_util, confArg.get("db_name_runs"), confArg.get("run_id"), confArg.get("file_id"), confArg.get("type"), confArg.get("timestamp_real"), confArg.get("timestamp_stop"), getFullPath(confArg), confArg.get("delimiter")); unlock(lockName); for (FileStatus file : files) { Path cur_file = file.getPath(); Path cur_local_path = new Path(new String(confArg.get("local_path") + confArg.get("file_id"))); String suffix = getSuffix(getFileName(confArg), cur_file.getName()); if (suffix.length() > 0) { cur_local_path = cur_local_path.suffix(new String("." + suffix)); } if (confArg.get("copy").equals("true")) { String crc = hdfs.getFileChecksum(cur_file).toString(); if (checksumLocalTest(cur_local_path, crc)) { continue; } else { hdfs.copyToLocalFile(cur_file, cur_local_path); writeChecksum(cur_local_path, crc); } } else { System.out.println(cur_local_path + "\t" + cur_file); } } } unlock(lockName); return 0; }
public static List<String> getText(BytesWritable value, Boolean tokenizep) throws InterruptedException { Session s = Session.getDefaultInstance(new Properties()); InputStream is = new ByteArrayInputStream(value.getBytes()); List<String> out = new ArrayList<String>(); try { MimeMessage message = new MimeMessage(s, is); message.getAllHeaderLines(); Analyzer standard_analyzer = new StandardAnalyzer(Version.LUCENE_43); Analyzer email_analyzer = new UAX29URLEmailAnalyzer(Version.LUCENE_43); Address[] fromAddrs = message.getFrom(); String fromAddrstr = ""; if (fromAddrs != null) { for (Address addr : fromAddrs) { fromAddrstr += (addr.toString() + " "); } } Address[] toAddrs = message.getAllRecipients(); String toAddrstr = ""; if (toAddrs != null) { for (Address addr : toAddrs) { toAddrstr += (addr.toString() + " "); } } String subject = message.getSubject(); String body = ""; try { Object content = message.getContent(); // System.err.println(content.getContentType()); if (content instanceof String) { body = (String) content; } else if (content instanceof Multipart) { Multipart mp = (Multipart) content; for (int i = 0; i < mp.getCount(); i++) { BodyPart bp = mp.getBodyPart(i); // System.err.println(bp.getContentType()); Object c = bp.getContent(); if (c instanceof String) { body = (String) c; } } } // people do really evil things with email, we're not sorting through it all now } catch (DecodingException e) { System.err.println("DecodingException"); } catch (UnsupportedEncodingException e) { System.err.println("UnsuportedEncodingException"); } catch (IOException e) { System.err.println("IOException"); } if (tokenizep) { List<String> fromData = new ArrayList<String>(); List<String> toData = new ArrayList<String>(); List<String> subjectData = new ArrayList<String>(); List<String> bodyData = new ArrayList<String>(); if (fromAddrstr != null) { fromData = tokenizeString(email_analyzer, fromAddrstr); } if (toAddrstr != null) { toData = tokenizeString(email_analyzer, toAddrstr); } if (subject != null) { subjectData = tokenizeString(standard_analyzer, subject); } if (body != null) { bodyData = tokenizeString(standard_analyzer, body); } out.add("FROM "); out.addAll(fromData); out.add("TO "); out.addAll(toData); out.add("SUBJECT "); out.addAll(subjectData); out.add("BODY "); out.addAll(bodyData); } else { // if not tokenizep, return list with from and subject fields only out.add(fromAddrstr); out.add(subject); } } catch (MessagingException e) { System.err.println("MessagineException"); } return out; }