/**
 * Emits one {@code (word, payload)} pair per input line.
 *
 * <p>The name of the grandparent directory of the current split's path is compared with the
 * {@code train_file} configuration value. On a match the payload is the second token of the
 * line. Otherwise the payload is {@code dlt} followed by one {@code name dlt score TAB} group
 * for every token pair after the second token.
 *
 * @throws java.util.NoSuchElementException if the line has fewer than two tokens, or an odd
 *     number of tokens after the second one in the non-matching case
 */
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  FileSplit split = (FileSplit) context.getInputSplit();
  String sourceDir = split.getPath().getParent().getParent().getName();
  String trainDir = context.getConfiguration().get("train_file");
  boolean fromTraining = sourceDir.equals(trainDir);

  // Both kinds of record start with "<word> <second token>".
  StringTokenizer tokens = new StringTokenizer(value.toString());
  String word = tokens.nextToken();
  String secondToken = tokens.nextToken();

  String payload;
  if (fromTraining) {
    payload = secondToken;
  } else {
    // The second token is consumed but not used for these records.
    StringBuilder groups = new StringBuilder(dlt);
    while (tokens.hasMoreTokens()) {
      String filename = tokens.nextToken();
      String score = tokens.nextToken();
      groups.append(filename).append(dlt).append(score).append("\t");
    }
    payload = groups.toString();
  }

  myKey.set(word);
  myVal.set(payload);
  context.write(myKey, myVal);
}
// specify input and out keys public void map( LongWritable key, Text value, OutputCollector<Text, Text> output, Reporter reporter) throws IOException { String line = value.toString(); // define new variable to be string ArrayList<Integer> range = new ArrayList<Integer>(); for (int i = 2000; i <= 2010; i++) { range.add(i); } // String[] inputs = line.split(",(?=([^\"]*\"[^\"]*\")*[^\"]*$)"); String[] inputs = line.split(","); try { int year = Integer.parseInt(inputs[165]); if (range.contains(year)) { String dur = inputs[3]; String artist_name = inputs[2]; String song_title = inputs[1]; String final_input = artist_name + ',' + dur + ',' + song_title; Final_Value.set(final_input); output.collect(Final_Value, dummy); } } catch (NumberFormatException e) { // do nothing } }
public void map( LongWritable key, Text value, OutputCollector<IntWritable, DoubleWritable> output, Reporter reporter) throws IOException { /* * It implements the mapper. It outputs the numbers of weight and updated weights. * * Note that the format of intermediate output is <IntWritable, DoubleWritable>, * because the key is the number of weight (an integer), and the value is the weight's value (double) */ inputData = value.toString(); // go through the process initialize(); getposphase(); getnegphase(); update(); // output the intermediate data // The <key, value> pairs are <weightID, weightUpdate> double[][] vishidinc_array = vishidinc.getArray(); for (int i = 0; i < numdims; i++) { for (int j = 0; j < numhid; j++) { weightPos.set(i * numhid + j); weightValue.set(vishidinc_array[i][j]); output.collect(weightPos, weightValue); } } }
// Identity public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { System.out.println("I'm in Job1 reduce"); for (Text val : values) { System.out.println("job1:redInp:-" + val.toString()); context.write(new Text(""), val); } }
/**
 * Combines the records seen for one key into a single output record.
 *
 * <p>Values whose status is {@code CrawlDatum.STATUS_INJECTED} are copied into the {@code
 * injected} field (with status rewritten to {@code STATUS_DB_UNFETCHED}); all other values are
 * copied into the {@code old} field. The {@code overwrite} and {@code update} fields then
 * decide how the two are merged.
 *
 * <p>NOTE(review): this copy of the source is damaged in two places (marked below); the code is
 * left exactly as found.
 *
 * @param key record key, emitted unchanged
 * @param values records for {@code key}
 * @param output receives exactly one {@code (key, record)} pair
 * @param reporter unused
 */
public void reduce(
    Text key,
    Iterator<CrawlDatum> values,
    OutputCollector<Text, CrawlDatum> output,
    Reporter reporter)
    throws IOException {
  boolean oldSet = false;
  boolean injectedSet = false;
  while (values.hasNext()) {
    // NOTE(review): right-hand side lost in extraction; presumably the next element of
    // `values` — confirm against the upstream file.
    CrawlDatum val =;
    if (val.getStatus() == CrawlDatum.STATUS_INJECTED) {
      injected.set(val);
      injected.setStatus(CrawlDatum.STATUS_DB_UNFETCHED);
      injectedSet = true;
    } else {
      old.set(val);
      oldSet = true;
    }
  }
  CrawlDatum res = null;

  // Whether to overwrite, ignore or update existing records.

  // Injected record already exists and overwrite but not update
  if (injectedSet && oldSet && overwrite) {
    res = injected;
    if (update) {
      // NOTE(review): the start of this statement was lost in extraction; what remains looks
      // like the tail of a log call.
      + " overwritten with injected record but update was specified.");
    }
  }

  // Injected record already exists and update but not overwrite
  if (injectedSet && oldSet && update && !overwrite) {
    res = old;
    old.putAllMetaData(injected);
    // Take the injected score/interval only when it differs from the configured default.
    old.setScore(injected.getScore() != scoreInjected ? injected.getScore() : old.getScore());
    old.setFetchInterval(
        injected.getFetchInterval() != interval
            ? injected.getFetchInterval()
            : old.getFetchInterval());
  }

  // Old default behaviour
  // NOTE(review): this if/else always assigns `res`, so whenever oldSet is true the result is
  // `old`, including the overwrite case above where `res = injected` was just chosen. Confirm
  // whether this is intended.
  if (injectedSet && !oldSet) {
    res = injected;
  } else {
    res = old;
  }
  output.collect(key, res);
}
/**
 * Re-emits the first two whitespace-separated tokens of the record, joined by a tab, under the
 * shared key {@code sameKey}.
 *
 * @throws java.util.NoSuchElementException if the record has fewer than two tokens
 */
public void map(
    IntWritable key, Text value, OutputCollector<IntWritable, Text> output, Reporter reporter)
    throws IOException {
  StringTokenizer fields = new StringTokenizer(value.toString());
  String label = fields.nextToken();
  String image = fields.nextToken();

  StringBuilder row = new StringBuilder(label);
  row.append("\t").append(image);
  dataString.set(row.toString());

  output.collect(sameKey, dataString);
}
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { // read in a document (point in 58-dimensional space) List<Double> p = GetPoint(value.toString()); int idxClosestCentoid = IndexOfClosestCentroid(p); context.write(new IntWritable(idxClosestCentoid), value); }
/**
 * Emits {@code ("1", 1)} once for every token that follows the leading integer id on the line.
 *
 * <p>The first token must parse as an integer; its value is not used. The content of the
 * remaining tokens is not used either, only their count.
 *
 * @throws NumberFormatException if the first token is not an integer
 * @throws java.util.NoSuchElementException if the line is blank
 */
@Override
public void map(Object key, Text value, Context context)
    throws IOException, InterruptedException {
  StringTokenizer tokenizer = new StringTokenizer(value.toString());

  // Validate and discard the leading id.
  Integer.parseInt(tokenizer.nextToken());

  int remaining = tokenizer.countTokens();
  for (int emitted = 0; emitted < remaining; emitted++) {
    context.write(new Text("1"), new IntWritable(1));
  }
}
/*
 * Finds a full file and sets it as the value.
 *
 * Every non-empty line that is read is appended to the result followed by "\n"; reading stops
 * once a read reports that nothing was retrieved. `key` is never written.
 *
 * NOTE(review): the read call on the right-hand side of `retrieved =` was lost in extraction;
 * the code is left exactly as found.
 * NOTE(review): this method returns true unconditionally, even when nothing was read. Confirm
 * that the caller has another way to detect end of input.
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  Text line = new Text();
  boolean retrieved = true;
  String result = "";
  value.clear();
  while (retrieved) {
    retrieved =, line);
    // Empty lines are dropped rather than preserved as blank lines.
    if (line.toString().length() > 0) {
      String lineValue = line.toString();
      result += lineValue + "\n";
    }
  }
  value.set(result);
  return true;
}
/**
 * Sums the numeric values of one key and writes {@code "<key>,<sum>"} as the output value with
 * a {@code null} output key.
 *
 * @throws NumberFormatException if any value is not a parsable double
 */
public void reduce(Text key, Iterable<Text> values, Context context)
    throws IOException, InterruptedException {
  double total = 0.0;
  for (Text term : values) {
    total += Double.parseDouble(term.toString());
  }

  StringBuilder record = new StringBuilder();
  record.append(key).append(",").append(Double.toString(total));
  context.write(null, new Text(record.toString()));
}
/**
 * Implement readFields of Writable.
 *
 * <p>Reads, in order: offset (long), length (long), then three counted string lists: names,
 * hosts and topology paths. Each list is an int count followed by that many serialized {@code
 * Text} values.
 *
 * <p>All three arrays are re-allocated to the size found in the stream. Previously only {@code
 * names} was; {@code hosts} and {@code topologyPaths} were written into whatever arrays the
 * instance already held, which fails (null or too-short array) or leaves stale trailing entries
 * (too-long array) when the instance is reused.
 *
 * @param in stream positioned at a record written by the matching {@code write}
 * @throws IOException if the stream cannot be read
 */
public void readFields(DataInput in) throws IOException {
  this.offset = in.readLong();
  this.length = in.readLong();

  // Text.readFields replaces the previous content, so one scratch object serves every entry.
  Text scratch = new Text();

  int numNames = in.readInt();
  this.names = new String[numNames];
  for (int i = 0; i < numNames; i++) {
    scratch.readFields(in);
    names[i] = scratch.toString();
  }

  int numHosts = in.readInt();
  this.hosts = new String[numHosts];
  for (int i = 0; i < numHosts; i++) {
    scratch.readFields(in);
    hosts[i] = scratch.toString();
  }

  int numTops = in.readInt();
  this.topologyPaths = new String[numTops];
  for (int i = 0; i < numTops; i++) {
    scratch.readFields(in);
    topologyPaths[i] = scratch.toString();
  }
}
public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { List<Double> newCentroid = null; int numPoints = 0; for (Text value : values) { ++numPoints; List<Double> p = GetPoint(value.toString()); if (newCentroid == null) { // initialize the new centroid to the first element newCentroid = new ArrayList<Double>(p); } else { for (int i = 0; i < newCentroid.size(); i++) { newCentroid.set(i, newCentroid.get(i) + p.get(i)); } } } // now the newCentroid contains the sum of all the points // so to get the average of all the points, we need to // divide each entry by the total number of points for (int i = 0; i < newCentroid.size(); i++) { newCentroid.set(i, newCentroid.get(i) / (double) numPoints); } // now create a string containing all the new centroid's coordinates String s = null; for (Double d : newCentroid) { if (s == null) { s = d.toString(); } else { s += " " + d.toString(); } } newCentroids.add(s); if (newCentroids.size() == 10) { WriteNewCentroids(context); } // output the centroid ID and the centroid data context.write(key, new Text(s)); }
/**
 * Multiplies one matrix entry by the matching vector entry.
 *
 * <p>The record is split on commas: field 0 becomes the output key, field 1 is the {@code long}
 * index looked up in {@code vector}, and field 2 is the matrix value. The product is emitted as
 * text under field 0.
 */
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  String[] fields = value.toString().split(",");

  /* multiplies matrix and vector entry with matching column value */
  double product = Double.parseDouble(fields[2]) * (vector.get(Long.parseLong(fields[1])));

  Text rowKey = new Text();
  rowKey.set(fields[0]);
  Text productText = new Text();
  productText.set(Double.toString(product));
  context.write(rowKey, productText);
}
/**
 * Tags the record with the code of the nearest vector in {@code m_kVectors}.
 *
 * <p>The record is prefixed with {@code "1\t"} and parsed with {@code Kvector.fromString}. The
 * first vector with the strictly smallest {@code distance} wins; the code is {@code -1} when
 * {@code m_kVectors} is empty. The prefixed record is emitted under that code.
 */
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  String str = "1\t".concat(value.toString());
  Kvector vector = Kvector.fromString(str);

  int bestCode = -1;
  int bestDistance = Integer.MAX_VALUE;
  for (Kvector candidate : m_kVectors) {
    int distance = vector.distance(candidate);
    if (distance >= bestDistance) {
      continue;
    }
    bestDistance = distance;
    bestCode = candidate.code;
  }

  context.write(new KcodeWritable(bestCode), new Text(str));
}
public void reduce(IntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException { System.out.println(PREFIX + "Collecting all the matched results"); for (Text val : values) { String[] tmp = val.toString() .split("\\|"); // The \\ here is very important. Cannot use "|" since split() // need a regex(regular expression), and the vertical bar is // special character. System.out.println("filename:" + tmp[0] + " ratio:" + tmp[1]); String filename = tmp[0]; double ratio = Double.valueOf(tmp[1]); // Key:filename Value:ratio context.write(new Text(filename), new DoubleWritable(ratio)); } }
@Override public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { String line = value.toString(); String[] tokens = line.split("\\s"); String hour = new String(); long numRequests = 0; if (tokens.length > 4) { // get the project name field hour = tokens[4]; // get the number of requests field numRequests = Long.parseLong(tokens[2]); // output the key, value pairs where the key is a combination of // project name and datetime and the value is number of requests context.write(new Text(hour), new LongWritable(numRequests)); } }
/**
 * Computes one updated component from a {@code "<rowIdx> <xValue>"} record and emits {@code
 * (rowIdx, (resVec[rowIdx] - sumVec[rowIdx] * xValue) / diaVec[rowIdx])}.
 *
 * <p>Blank records are skipped. Previously a blank line fell through with {@code rowIdx = 0}
 * and {@code xValue = 0} and emitted a spurious value for row 0.
 *
 * @param key input offset (unused)
 * @param value one input line
 * @param output receives the updated component keyed by row index
 * @param reporter unused
 * @throws NumberFormatException if either token is not numeric
 * @throws java.util.NoSuchElementException if the line has a row index but no value
 */
public void map(
    LongWritable key,
    Text value,
    OutputCollector<IntWritable, DoubleWritable> output,
    Reporter reporter)
    throws IOException {
  StringTokenizer tokenizer = new StringTokenizer(value.toString());
  if (!tokenizer.hasMoreTokens()) {
    // Nothing to compute for an empty line.
    return;
  }

  int rowIdx = Integer.parseInt(tokenizer.nextToken());
  double xValue = Double.parseDouble(tokenizer.nextToken());

  double xResult = (resVec[rowIdx] - (sumVec[rowIdx] * xValue)) / diaVec[rowIdx];
  output.collect(new IntWritable(rowIdx), new DoubleWritable(xResult));
}
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { String[] pair = new String[2]; int count = 0; for (Text txt : values) { pair[count] = txt.toString(); count++; } // word exists in training if (count == 2) { StringTokenizer st_one, st_two; if (pair[0].contains(dlt)) { st_one = new StringTokenizer(pair[1]); st_two = new StringTokenizer(pair[0]); } else { st_one = new StringTokenizer(pair[0]); st_two = new StringTokenizer(pair[1]); } // outputting the data String f_id = st_one.nextToken(); StringBuilder builder = new StringBuilder(dlt); builder.append(f_id); builder.append(dlt); while (st_two.hasMoreTokens()) { String filename = st_two.nextToken(); String tf_idf = st_two.nextToken(); builder.append(filename); builder.append(dlt); builder.append(tf_idf); builder.append("\t"); } myVal.set(builder.toString()); context.write(key, myVal); } }
/*
 * Finds a full sentence and sets it as the value.
 * If the sentence is shorter than the full line, the rest is stored to use later.
 *
 * The result starts from the text left over by the previous call (`leftovers`). Lines are
 * appended, each preceded by a space, until one contains a '.'; that line is cut just after
 * the first period and the remainder becomes the new `leftovers`. `key` is never written.
 *
 * Returns true when a sentence ending in a period was assembled. Returns false when the input
 * ran out first; in that case `value` still receives whatever text was accumulated.
 *
 * NOTE(review): the read call on the right-hand side of `retrieved =` was lost in extraction;
 * the code is left exactly as found.
 * NOTE(review): only the first period of a line is considered. If `leftovers` itself contains
 * further periods they are not split until a later line is read.
 */
public synchronized boolean next(LongWritable key, Text value) throws IOException {
  Text line = new Text();
  boolean getMore = true;
  boolean retrieved = false;
  String result = leftovers;
  leftovers = "";
  value.clear();
  while (getMore) {
    retrieved =, line);
    if (retrieved) {
      String lineValue = line.toString();
      // here, we assume sentences run until the period.
      int endOfSentence = lineValue.indexOf('.');
      if (endOfSentence == -1) {
        // no period yet: the whole line belongs to the current sentence
        result += " " + lineValue;
      } else {
        result += " " + lineValue.substring(0, endOfSentence + 1);
        leftovers = lineValue.substring(endOfSentence + 1);
        getMore = false;
      }
    } else {
      // input exhausted before a period was found
      getMore = false;
      value.set(result);
      return false;
    }
  }
  value.set(result);
  return true;
}
/**
 * Splits one input video into consecutive pieces and uploads each piece.
 *
 * <p>The input line is split on a single space: {@code info[0]} is the source path, and {@code
 * info[1]} is appended verbatim to the emitted value after a {@code *}. The source is copied to
 * {@code /home/<name>}, its duration is read from the stderr of {@code ffmpeg -i}, and its size
 * from {@code du -s}. {@code mencoder} is then run once per piece, and every piece is moved to
 * the source's directory as {@code <name>_<n>.<ext>}.
 *
 * <p>Emits {@code (local path without extension, "<piece 1> #!# <piece 2> #!# ... *<info[1]>")}.
 *
 * <p>NOTE(review): several right-hand sides (the regex group reads and the duration parse) were
 * lost in extraction; the code is left exactly as found.
 * NOTE(review): an {@code IOException} terminates the whole JVM through {@code
 * System.exit(-1)}.
 */
// The input video files are split into chunks of 64MB here...
public void map(Object key, Text value, Context context)
    throws IOException, InterruptedException {
  String line = value.toString();
  System.out.println("job1:mapInp:-" + line);
  String[] info = line.split(" ");
  info[0] = info[0].trim();
  info[1] = info[1].trim();
  String lstfnames = "", fname = "";
  try {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    String prefixPath = "", fnm = "";
    // Greedy match: group 1 is everything before the last '/', group 2 everything after it.
    Pattern x = Pattern.compile("(.*)/(.*)");
    Matcher xm = x.matcher(info[0]);
    while (xm.find()) {
      // NOTE(review): right-hand sides lost in extraction (presumably the two regex groups).
      prefixPath =;
      fnm =;
    }
    String dst = "/home/" + fnm; // dst is path of the file on local system.
    hdfs.copyToLocalFile(new Path(info[0]), new Path(dst));
    // NOTE(review): the command is built by string concatenation, so a file name containing
    // spaces or shell-significant characters will not be passed through intact.
    Process p = Runtime.getRuntime().exec("ffmpeg -i " + dst);
    String s;
    BufferedReader stdError = new BufferedReader(new InputStreamReader(p.getErrorStream()));
    Pattern D = Pattern.compile("Duration:[ ]*([0-9]+):([0-9]+):([0-9]+)");
    long time = 0; // "time" is the duration of the input video file
    long sps = 0; // "sps" is the number of seconds(duration) of each video split
    while ((s = stdError.readLine()) != null) {
      Matcher md = D.matcher(s);
      while (md.find()) {
        // NOTE(review): the parseLong arguments were lost in extraction; the shape is
        // hours * 3600 + minutes * 60 + seconds.
        time = Long.parseLong( * 3600 + Long.parseLong( * 60 + Long.parseLong(;
      }
    }
    Process p1 = Runtime.getRuntime().exec("du -s " + dst);
    BufferedReader stdInput1 = new BufferedReader(new InputStreamReader(p1.getInputStream()));
    String s1 = "", size = ""; // "size" is the size of input video file
    while ((s1 = stdInput1.readLine()) != null) {
      String s11[] = s1.split("\t");
      size = s11[0];
    }
    // NOTE(review): assumes `du -s` reports 1 KiB blocks, so 64 * 1024 blocks = 64 MB —
    // confirm. A reported size of 0 makes this division throw ArithmeticException.
    sps = (64 * 1024) * time / (Long.parseLong(size)); // chunk size is 64MB
    String hr, min, sc;
    // Format sps as a zero-padded HH:MM:SS length for mencoder's -endPos.
    hr = Long.toString((sps / 3600));
    min = Long.toString((sps % 3600) / 60);
    sc = Long.toString(sps % 60);
    if (hr.length() < 2) hr = "0" + hr;
    if (min.length() < 2) min = "0" + min;
    if (sc.length() < 2) sc = "0" + sc;
    String splt = hr + ":" + min + ":" + sc;
    // building query to split the input video file
    String query = "mencoder -oac copy -ovc copy -ss ";
    String app = "", inpExt = "";
    Pattern xx = Pattern.compile("(.*)\\.(.*)");
    Matcher xxm = xx.matcher(dst);
    while (xxm.find()) {
      // NOTE(review): right-hand sides lost in extraction (presumably the two regex groups).
      fname =;
      inpExt =;
    }
    String[] tmpArr = fname.split("/"); // not read afterwards
    String hdfsFname = ""; // not read afterwards
    long stSrt = 0; // start offset, in seconds, of the piece being cut
    int cnt = 0; // 1-based piece number
    // NOTE(review): if sps is 0 (integer division above) stSrt never advances and this loop
    // does not terminate.
    while (true) {
      if (stSrt > time) break;
      if (stSrt + sps > time) {
        // last piece: shorten the length to what is left of the video
        long t = time - stSrt;
        hr = Long.toString((t / 3600));
        min = Long.toString((t % 3600) / 60);
        sc = Long.toString(t % 60);
        if (hr.length() < 2) hr = "0" + hr;
        if (min.length() < 2) min = "0" + min;
        if (sc.length() < 2) sc = "0" + sc;
        splt = hr + ":" + min + ":" + sc;
      }
      cnt++;
      // Format the start offset as HH:MM:SS for -ss.
      hr = Long.toString((stSrt / 3600));
      min = Long.toString((stSrt % 3600) / 60);
      sc = Long.toString(stSrt % 60);
      if (hr.length() < 2) hr = "0" + hr;
      if (min.length() < 2) min = "0" + min;
      if (sc.length() < 2) sc = "0" + sc;
      app =
          hr + ":" + min + ":" + sc + " -endPos " + splt + " " + dst + " -o " + fname + "_"
              + Integer.toString(cnt) + "." + inpExt;
      Process p2 = Runtime.getRuntime().exec(query + app);
      String ls_str = "";
      DataInputStream ls_in = new DataInputStream(p2.getInputStream());
      // Drain stdout until the process closes it; stderr is not read.
      while ((ls_str = ls_in.readLine()) != null) {}
      p2.destroy();
      String[] tmpArr1 = fnm.split("\\.");
      // Both flags are true: delete the local piece and overwrite an existing destination.
      hdfs.copyFromLocalFile(
          true,
          true,
          new Path(fname + "_" + Integer.toString(cnt) + "." + inpExt),
          new Path(prefixPath + "/" + tmpArr1[0] + "_" + Integer.toString(cnt) + "." + inpExt));
      lstfnames +=
          prefixPath + "/" + tmpArr1[0] + "_" + Integer.toString(cnt) + "." + inpExt + " #!# ";
      stSrt += sps;
    }
    Runtime rt1 = Runtime.getRuntime();
    String[] cmd1 = {"/bin/bash", "-c", "rm " + dst}; // delete the file after use
    Process pr1 = rt1.exec(cmd1);
    pr1.waitFor();
    lstfnames += "*" + info[1];
    // "fname" contains name of the input video file with extension (eg. ".avi") removed;
    // "lstfnames" is a string that contains all the names of video splits (concatenated)
    context.write(new Text(fname), new Text(lstfnames));
    System.out.println("lstfnames : " + lstfnames);
  } catch (IOException e) {
    System.out.println("exception happened - here's what I know: ");
    e.printStackTrace();
    System.exit(-1);
  }
}
/**
 * Scores one stored image feature against the query feature and emits it when it matches well
 * enough.
 *
 * <p>{@code key} is the image file name and {@code value} its feature as JSON. The query
 * feature is read from the file named by the {@code featureFilePath} configuration value. Both
 * are converted with {@code json2mat} and matched with a FLANN-based matcher. A match is "good"
 * when its distance is at most {@code max_dist * THRESHOLD_FACTOR}. If the share of good matches
 * exceeds {@code PERCENTAGE_THRESHOLD}, {@code (ONE, "filename|ratio")} is written.
 *
 * <p>Records with an empty key or value are ignored silently.
 *
 * <p>NOTE(review): the stream-open and read calls were lost in extraction; the code is left
 * exactly as found.
 * NOTE(review): the query feature file is re-read and re-parsed for every input record.
 * NOTE(review): {@code query_features} is not null-checked, although {@code descriptor} from
 * the same helper is.
 */
public void map(Text key, Text value, Context context)
    throws InterruptedException, IOException {
  String filename = key.toString();
  String json = value.toString();
  // Make sure the input is valid
  if (!(filename.isEmpty() || json.isEmpty())) {
    // Change the json-type feature to Mat-type feature
    Mat descriptor = json2mat(json);
    if (descriptor != null) {
      // Read the query feature from the cache in Hadoop
      Mat query_features;
      String pathStr = context.getConfiguration().get("featureFilePath");
      FileSystem fs = FileSystem.get(context.getConfiguration());
      // NOTE(review): the call that opens the stream was lost in extraction.
      FSDataInputStream fsDataInputStream = Path(pathStr));
      StringBuilder sb = new StringBuilder();
      // Use a buffer to read the query_feature
      int remain = fsDataInputStream.available();
      while (remain > 0) {
        int read;
        byte[] buf = new byte[BUF_SIZE];
        // NOTE(review): the read call was lost in extraction; only its trailing arguments
        // remain.
        read =, fsDataInputStream.available() - remain, BUF_SIZE);
        // NOTE(review): decoding each buffer separately can split a multi-byte UTF-8
        // character that straddles two reads.
        sb.append(new String(buf, 0, read, StandardCharsets.UTF_8));
        remain = remain - read;
        System.out.println("remain:" + remain + "\tread:" + read + "\tsb.size:" + sb.length());
      }
      // (An earlier Scanner-based, line-by-line read of the query feature was disabled here.)
      String query_json = sb.toString();
      fsDataInputStream.close();
      query_features = json2mat(query_json);
      // Get the similarity of the current database image against the query image
      DescriptorMatcher matcher = DescriptorMatcher.create(DescriptorMatcher.FLANNBASED);
      MatOfDMatch matches = new MatOfDMatch();
      // Ensure the two features have the same number of columns (the original comment notes
      // that the extracted features all have 128 columns, at least in this case)
      if (query_features.cols() == descriptor.cols()) {
        matcher.match(query_features, descriptor, matches);
        DMatch[] dMatches = matches.toArray();
        // Calculate the max/min distances
        // NOTE(review): min_dist starts at the fixed value 100, so it is only meaningful when
        // some distance is below 100; it is used for logging only.
        double max_dist = 0;
        double min_dist = 100;
        for (int i = 0; i < dMatches.length; i++) {
          double dist = dMatches[i].distance;
          if (min_dist > dist) min_dist = dist;
          if (max_dist < dist) max_dist = dist;
        }
        // Only distances ≤ threshold are good matches
        // (a disabled alternative used min_dist * 2 as the threshold)
        double threshold = max_dist * THRESHOLD_FACTOR;
        LinkedList<DMatch> goodMatches = new LinkedList<DMatch>();
        for (int i = 0; i < dMatches.length; i++) {
          if (dMatches[i].distance <= threshold) {
            goodMatches.addLast(dMatches[i]);
          }
        }
        // Get the ratio of good_matches to all_matches
        // With no matches at all this is 0.0 / 0.0 = NaN, which fails the comparison below.
        double ratio = (double) goodMatches.size() / (double) dMatches.length;
        System.out.println("*** current_record_filename:" + filename + " ***");
        System.out.println("feature:" + descriptor + "\nquery_feature:" + query_features);
        System.out.println(
            "min_dist of keypoints:" + min_dist + " max_dist of keypoints:" + max_dist);
        System.out.println(
            "total_matches:" + dMatches.length + "\tgood_matches:" + goodMatches.size());
        // (Disabled debug output of type/channels/rows/cols for both matrices was here.)
        System.out.println();
        if (ratio > PERCENTAGE_THRESHOLD) {
          // Key:1 Value:filename|ratio
          context.write(ONE, new Text(filename + "|" + ratio));
          // (A disabled alternative emitted the number of good matches instead of the ratio.)
        }
      } else {
        System.out.println("The size of the features are not equal");
      }
    } else {
      // json2mat returned null: report it and emit nothing
      System.out.println("A broken/null feature:" + filename);
      System.out.println();
    }
  }
}
// ## operation readReactorOutputFile(ReactionModel) public SystemSnapshot readReactorOutputFile(ReactionModel p_reactionModel) { // #[ operation readReactorOutputFile(ReactionModel) try { // open output file and build the DOM tree String dir = System.getProperty("RMG.workingDirectory"); String filename = "chemkin/reactorOutput.xml"; File inputFile = new File(filename); DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); factory.setValidating(true); // validate the document with the DTD factory.setIgnoringElementContentWhitespace(true); // ignore whitespace DocumentBuilder builder = factory.newDocumentBuilder(); Document doc = builder.parse(inputFile); // get root element and its children Element root = doc.getDocumentElement(); NodeList rootchildren = root.getChildNodes(); // header is rootchildren.item(0) // get return message and check for successful run Element returnmessageElement = (Element) rootchildren.item(1); Text returnmessageText = (Text) returnmessageElement.getFirstChild(); String returnmessage = returnmessageText.toString(); returnmessage = returnmessage.trim(); if (!returnmessage.contains("SUCCESSFULLY COMPLETED RUN.")) { System.out.println("External reactor model failed!"); System.out.println("Reactor model error message: " + returnmessage); System.exit(0); } // get outputvalues element and its children Element outputvaluesElement = (Element) rootchildren.item(2); NodeList children = outputvaluesElement.getChildNodes(); // get time Element timeElement = (Element) children.item(0); Text timeText = (Text) timeElement.getFirstChild(); double time = Double.parseDouble(timeText.getData()); String timeUnits = timeElement.getAttribute("units"); // get systemstate element and its children Element systemstateElement = (Element) children.item(1); NodeList states = systemstateElement.getChildNodes(); // get temperature and its units Element temperatureElement = (Element) states.item(0); String tempUnits = 
temperatureElement.getAttribute("units"); Text temperatureText = (Text) temperatureElement.getFirstChild(); double temp = Double.parseDouble(temperatureText.getData()); Temperature T = new Temperature(temp, tempUnits); // get pressure and its units Element pressureElement = (Element) states.item(1); String presUnits = pressureElement.getAttribute("units"); Text pressureText = (Text) pressureElement.getFirstChild(); double pres = Double.parseDouble(pressureText.getData()); Pressure P = new Pressure(pres, presUnits); // get species amounts (e.g. concentrations) ArrayList speciesIDs = new ArrayList(); ArrayList amounts = new ArrayList(); ArrayList fluxes = new ArrayList(); String amountUnits = null; String fluxUnits = null; // loop thru all the species // begin at i=2, since T and P take already the first two position of states int nSpe = (states.getLength() - 2) / 2; int index = 0; LinkedHashMap inertGas = new LinkedHashMap(); for (int i = 2; i < nSpe + 2; i++) { // get amount element and the units Element amountElement = (Element) states.item(i); amountUnits = amountElement.getAttribute("units"); Element fluxElement = (Element) states.item(i + nSpe); fluxUnits = fluxElement.getAttribute("units"); // get speciesid and store in an array list String thisSpeciesID = amountElement.getAttribute("speciesid"); // get amount (e.g. concentraion) and store in an array list Text amountText = (Text) amountElement.getFirstChild(); double thisAmount = Double.parseDouble(amountText.getData()); if (thisAmount < 0) { double aTol = ReactionModelGenerator.getAtol(); // if (Math.abs(thisAmount) < aTol) thisAmount = 0; // else throw new NegativeConcentrationException("Negative concentration in // reactorOutput.xml: " + thisSpeciesID); if (thisAmount < -100.0 * aTol) throw new NegativeConcentrationException( "Species " + thisSpeciesID + " has negative concentration: " + String.valueOf(thisAmount)); } // get amount (e.g. 
concentraion) and store in an array list Text fluxText = (Text) fluxElement.getFirstChild(); double thisFlux = Double.parseDouble(fluxText.getData()); if (thisSpeciesID.compareToIgnoreCase("N2") == 0 || thisSpeciesID.compareToIgnoreCase("Ne") == 0 || thisSpeciesID.compareToIgnoreCase("Ar") == 0) { inertGas.put(thisSpeciesID, new Double(thisAmount)); } else { speciesIDs.add(index, thisSpeciesID); amounts.add(index, new Double(thisAmount)); fluxes.add(index, new Double(thisFlux)); index++; } } // print results for debugging purposes /** * System.out.println(returnmessage); System.out.println("Temp = " + temp + " " + tempUnits); * System.out.println("Pres = " + pres + " " + presUnits); for (int i = 0; i < amounts.size(); * i++) { System.out.println(speciesIDs.get(i) + " " + amounts.get(i) + " " + amountUnits); } */ ReactionTime rt = new ReactionTime(time, timeUnits); LinkedHashMap speStatus = generateSpeciesStatus(p_reactionModel, speciesIDs, amounts, fluxes); SystemSnapshot ss = new SystemSnapshot(rt, speStatus, T, P); ss.inertGas = inertGas; return ss; } catch (Exception e) { System.out.println("Error reading reactor model output: " + e.getMessage()); System.exit(0); return null; } // #] }
/**
 * Delete the dst files/dirs which do not exist in src.
 *
 * <p>Works in three steps: (1) walk {@code dstroot} and write every entry's path relative to
 * {@code dstroot} into a sequence file under {@code jobdir}; (2) sort that file by path; (3)
 * step through it alongside {@code dstsorted} and run {@code -rmr} through {@code FsShell} for
 * every listed path that has no equal entry in {@code dstsorted}.
 *
 * <p>NOTE(review): the {@code next(...)} reads and the shell invocation were lost in
 * extraction; the code is left exactly as found.
 *
 * @param dstfs file system containing the destination tree
 * @param dstroot status of the destination root; must be a directory
 * @param dstsorted sequence file of expected destination paths (presumably sorted the same way
 *     as the listing — confirm against the caller)
 * @param jobfs file system holding the job's working files
 * @param jobdir directory in which the two listing files are created
 * @param jobconf configuration used for the sequence-file reader, writer and sorter
 * @param conf configuration handed to {@code FsShell}
 * @throws IOException if {@code dstroot} is not a directory, if the shell throws, or if a
 *     delete returns a non-zero status
 */
private static void deleteNonexisting(
    FileSystem dstfs,
    FileStatus dstroot,
    Path dstsorted,
    FileSystem jobfs,
    Path jobdir,
    JobConf jobconf,
    Configuration conf)
    throws IOException {
  if (!dstroot.isDir()) {
    throw new IOException(
        "dst must be a directory when option "
            + Options.DELETE.cmd
            + " is set, but dst (= "
            + dstroot.getPath()
            + ") is not a directory.");
  }

  // write dst lsr results
  final Path dstlsr = new Path(jobdir, "_distcp_dst_lsr");
  final SequenceFile.Writer writer =
      SequenceFile.createWriter(
          jobfs, jobconf, dstlsr, Text.class, FileStatus.class, SequenceFile.CompressionType.NONE);
  try {
    // do lsr to get all file statuses in dstroot
    // Explicit stack instead of recursion; dstroot itself is never written, only descendants.
    final Stack<FileStatus> lsrstack = new Stack<FileStatus>();
    for (lsrstack.push(dstroot); !lsrstack.isEmpty(); ) {
      final FileStatus status = lsrstack.pop();
      if (status.isDir()) {
        for (FileStatus child : dstfs.listStatus(status.getPath())) {
          String relative = makeRelative(dstroot.getPath(), child.getPath());
          writer.append(new Text(relative), child);
          lsrstack.push(child);
        }
      }
    }
  } finally {
    checkAndClose(writer);
  }

  // sort lsr results
  final Path sortedlsr = new Path(jobdir, "_distcp_dst_lsr_sorted");
  SequenceFile.Sorter sorter =
      new SequenceFile.Sorter(jobfs, new Text.Comparator(), Text.class, FileStatus.class, jobconf);
  sorter.sort(dstlsr, sortedlsr);

  // compare lsr list and dst list
  SequenceFile.Reader lsrin = null;
  SequenceFile.Reader dstin = null;
  try {
    lsrin = new SequenceFile.Reader(jobfs, sortedlsr, jobconf);
    dstin = new SequenceFile.Reader(jobfs, dstsorted, jobconf);

    // compare sorted lsr list and sorted dst list
    final Text lsrpath = new Text();
    final FileStatus lsrstatus = new FileStatus();
    final Text dstpath = new Text();
    final Text dstfrom = new Text();
    final FsShell shell = new FsShell(conf);
    // shellargs[1] holds the most recently deleted path (null until the first delete).
    final String[] shellargs = {"-rmr", null};

    // NOTE(review): reader calls lost in extraction on the next two statements.
    boolean hasnext =, dstfrom);
    for (;, lsrstatus); ) {
      int dst_cmp_lsr = dstpath.compareTo(lsrpath);
      // advance the dst list until it catches up with the current lsr path
      for (; hasnext && dst_cmp_lsr < 0; ) {
        hasnext =, dstfrom);
        dst_cmp_lsr = dstpath.compareTo(lsrpath);
      }

      if (dst_cmp_lsr == 0) {
        // lsrpath exists in dst, skip it
        hasnext =, dstfrom);
      } else {
        // lsrpath does not exist, delete it
        String s = new Path(dstroot.getPath(), lsrpath.toString()).toString();
        // Skip the delete when the previously deleted path is an ancestor of this one.
        if (shellargs[1] == null || !isAncestorPath(shellargs[1], s)) {
          shellargs[1] = s;
          int r = 0;
          try {
            // NOTE(review): the shell invocation was lost in extraction.
            r =;
          } catch (Exception e) {
            throw new IOException("Exception from shell.", e);
          }
          if (r != 0) {
            throw new IOException(
                "\"" + shellargs[0] + " " + shellargs[1] + "\" returns non-zero value " + r);
          }
        }
      }
    }
  } finally {
    checkAndClose(lsrin);
    checkAndClose(dstin);
  }
}
/**
 * Converts every video piece named in the input record to the requested target format.
 *
 * <p>The record has the form {@code "<piece 1> #!# <piece 2> #!# ... #!# *<format>"}. Each
 * piece is moved to {@code /home/}, converted with {@code mencoder} when the format is {@code
 * mpg}, {@code mpeg}, {@code mp4} or {@code avi}, and the result is moved to the piece's
 * original directory with the new extension.
 *
 * <p>Emits {@code (first piece path, converted paths joined by " #!# ")}.
 *
 * <p>NOTE(review): the regex group reads were lost in extraction; the code is left exactly as
 * found.
 * NOTE(review): for an unsupported format no conversion runs, yet the upload below is still
 * attempted for a file that was never produced.
 * NOTE(review): an {@code IOException} terminates the whole JVM through {@code
 * System.exit(-1)}.
 */
// Video splits are converted to target format here...
public void map(Object key, Text value, Context context)
    throws IOException, InterruptedException {
  try {
    Configuration config = new Configuration();
    FileSystem hdfs = FileSystem.get(config);
    String st = value.toString();
    st = st.trim();
    System.out.println("job2:mapInp:-" + st);
    // fmt[0] is the list of pieces, fmt[1] the target format.
    String[] fmt = st.split(" #!# \\*");
    String[] lst = fmt[0].split(" #!# ");
    // out: converted paths; dlt: local source files to delete afterwards
    String out = "", dlt = "";
    int flag = 1; // 1 until the first piece has been appended to `out`
    for (String st1 : lst) {
      Pattern x = Pattern.compile("(.*)/(.*)");
      Matcher xm = x.matcher(st1);
      String prefixPath = "", fnm = "", inpExt = "";
      while (xm.find()) {
        // NOTE(review): right-hand sides lost in extraction (presumably the two regex groups).
        prefixPath =;
        fnm =;
      }
      // Split on every '.', keeping only the first two parts as base name and extension.
      String[] tmpArr = fnm.split("\\.");
      fnm = tmpArr[0];
      inpExt = tmpArr[1];
      // first argument true: the source piece is deleted after the copy
      hdfs.copyToLocalFile(true, new Path(st1), new Path("/home/" + fnm + "." + inpExt));
      String fname = "/home/" + fnm;
      if (flag == 1) {
        flag = 0;
        out += prefixPath + "/" + fnm + "." + fmt[1];
      } else {
        out += " #!# " + prefixPath + "/" + fnm + "." + fmt[1];
      }
      if (fmt[1].equals("mpg") || fmt[1].equals("mpeg") || fmt[1].equals("mp4")) {
        Process p =
            Runtime.getRuntime()
                .exec(
                    "mencoder -of mpeg -ovc lavc -lavcopts vcodec=mpeg1video -oac copy "
                        + "/home/"
                        + fnm
                        + "."
                        + inpExt
                        + " -o "
                        + fname
                        + "."
                        + fmt[1]);
        String ls_str = "";
        DataInputStream ls_in = new DataInputStream(p.getInputStream());
        // Drain stdout until the process closes it; stderr is not read.
        while ((ls_str = ls_in.readLine()) != null) {}
        p.destroy();
        dlt += " /home/" + fnm + "." + inpExt;
      } else if (fmt[1].equals("avi")) {
        Process p =
            Runtime.getRuntime()
                .exec(
                    "mencoder -ovc lavc -oac mp3lame -o "
                        + fname
                        + "."
                        + fmt[1]
                        + " "
                        + "/home/"
                        + fnm
                        + "."
                        + inpExt);
        String ls_str = "";
        DataInputStream ls_in = new DataInputStream(p.getInputStream());
        while ((ls_str = ls_in.readLine()) != null) {}
        p.destroy();
        dlt += " /home/" + fnm + "." + inpExt;
      } else {
        // TBD
        System.out.println("Unsupported target format!!!!!");
      }
      // Both flags are true: delete the local result and overwrite an existing destination.
      hdfs.copyFromLocalFile(
          true,
          true,
          new Path(fname + "." + fmt[1]),
          new Path(prefixPath + "/" + fnm + "." + fmt[1]));
    }
    Runtime rt1 = Runtime.getRuntime();
    // NOTE(review): when nothing was converted `dlt` is empty and this runs a bare "rm".
    String[] cmd1 = {"/bin/bash", "-c", "rm" + dlt}; // delete the files after use
    Process pr1 = rt1.exec(cmd1);
    pr1.waitFor();
    System.out.println("Job2 mapOut:" + out);
    context.write(new Text(lst[0]), new Text(out));
    System.out.println(out);
  } catch (IOException e) {
    System.out.println("exception happened - here's what I know: ");
    e.printStackTrace();
    System.exit(-1);
  }
}
/**
 * Turns one line of a seed list into an injected {@code CrawlDatum}.
 *
 * <p>Lines whose trimmed text starts with {@code #} are ignored. A line may carry tab-separated
 * {@code name=value} metadata after the URL. Three names are special: {@code
 * nutchScoreMDName} sets the score, {@code nutchFetchIntervalMDName} the fetch interval, and
 * {@code nutchFixedFetchIntervalMDName} a fixed interval that is also stored as metadata. Any
 * other pair is copied into the datum's metadata.
 *
 * <p>The URL is normalized and filtered. If that yields {@code null} or throws, the {@code
 * injector/urls_filtered} counter is incremented and nothing is emitted. Otherwise {@code
 * (url, datum)} is emitted and {@code injector/urls_injected} is incremented.
 *
 * <p>NOTE(review): the iterator read inside the metadata loop was lost in extraction; the code
 * is left exactly as found.
 *
 * @param key unused
 * @param value the input line; overwritten with the normalized URL before it is emitted
 * @param output receives the accepted URL and its datum
 * @param reporter source of the two counters
 */
public void map(
    WritableComparable<?> key,
    Text value,
    OutputCollector<Text, CrawlDatum> output,
    Reporter reporter)
    throws IOException {
  String url = value.toString(); // value is line of text

  if (url != null && url.trim().startsWith("#")) {
    /* Ignore lines that start with # */
    return;
  }

  // if tabs : metadata that could be stored
  // must be name=value and separated by \t
  float customScore = -1f; // -1 means "no score given on the line"
  int customInterval = interval;
  int fixedInterval = -1; // -1 means "no fixed interval given on the line"
  Map<String, String> metadata = new TreeMap<String, String>();
  if (url.indexOf("\t") != -1) {
    String[] splits = url.split("\t");
    url = splits[0];
    for (int s = 1; s < splits.length; s++) {
      // find separation between name and value
      int indexEquals = splits[s].indexOf("=");
      if (indexEquals == -1) {
        // skip anything without a =
        continue;
      }
      String metaname = splits[s].substring(0, indexEquals);
      String metavalue = splits[s].substring(indexEquals + 1);
      // For the three special names an unparsable value is ignored and the default kept.
      if (metaname.equals(nutchScoreMDName)) {
        try {
          customScore = Float.parseFloat(metavalue);
        } catch (NumberFormatException nfe) {
        }
      } else if (metaname.equals(nutchFetchIntervalMDName)) {
        try {
          customInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
        }
      } else if (metaname.equals(nutchFixedFetchIntervalMDName)) {
        try {
          fixedInterval = Integer.parseInt(metavalue);
        } catch (NumberFormatException nfe) {
        }
      } else metadata.put(metaname, metavalue);
    }
  }
  try {
    url = urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT);
    url = filters.filter(url); // filter the url
  } catch (Exception e) {
    if (LOG.isWarnEnabled()) {
      LOG.warn("Skipping " + url + ":" + e);
    }
    url = null;
  }
  if (url == null) {
    reporter.getCounter("injector", "urls_filtered").increment(1);
  } else { // if it passes
    value.set(url); // collect it
    CrawlDatum datum = new CrawlDatum();
    datum.setStatus(CrawlDatum.STATUS_INJECTED);

    // Is interval custom? Then set as meta data
    if (fixedInterval > -1) {
      // Set writable using float. Float is used by AdaptiveFetchSchedule
      datum
          .getMetaData()
          .put(Nutch.WRITABLE_FIXED_INTERVAL_KEY, new FloatWritable(fixedInterval));
      datum.setFetchInterval(fixedInterval);
    } else {
      datum.setFetchInterval(customInterval);
    }

    datum.setFetchTime(curTime);
    // now add the metadata
    Iterator<String> keysIter = metadata.keySet().iterator();
    while (keysIter.hasNext()) {
      // NOTE(review): right-hand side lost in extraction; presumably the iterator's next key.
      String keymd =;
      String valuemd = metadata.get(keymd);
      datum.getMetaData().put(new Text(keymd), new Text(valuemd));
    }
    // NOTE(review): a score of exactly -1 on the input line cannot be told apart from "no
    // score given" and falls back to scoreInjected.
    if (customScore != -1) datum.setScore(customScore);
    else datum.setScore(scoreInjected);
    try {
      scfilters.injectedScore(value, datum);
    } catch (ScoringFilterException e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn(
            "Cannot filter injected score for url "
                + url
                + ", using default ("
                + e.getMessage()
                + ")");
      }
    }
    reporter.getCounter("injector", "urls_injected").increment(1);
    output.collect(value, datum);
  }
}
/**
 * Serializes the pair as two modified-UTF-8 strings: the left bigram's text first, then the
 * right bigram's.
 */
public void write(DataOutput out) throws IOException {
  String left = leftBigram.toString();
  out.writeUTF(left);

  String right = rightBigram.toString();
  out.writeUTF(right);
}
// merge the converted files here public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { System.out.println("I'm in Job2 reduce"); Configuration config = new Configuration(); FileSystem hdfs = FileSystem.get(config); try { String out = ""; for (Text t : values) { out = t.toString(); out = out.trim(); System.out.println("job2:redInp:-" + out); break; } String[] outl = out.split(" #!# "); Pattern x = Pattern.compile("(.*)/(.*)\\.(.*)"); Matcher xm = x.matcher(outl[0]); String prefixPath = "", fnm = "", ext = ""; while (xm.find()) { prefixPath =; fnm =; ext =; } String foutname = fnm.split("_")[0]; foutname += "." + ext; String query = "mencoder -oac copy -ovc copy"; int cnt = 0; for (String st : outl) { cnt++; hdfs.copyToLocalFile( true, new Path(st), new Path("/home/" + fnm.split("_")[0] + "_" + Integer.toString(cnt) + "." + ext)); query += " " + "/home/" + fnm.split("_")[0] + "_" + Integer.toString(cnt) + "." + ext; } query += " -o " + "/home/" + foutname; Process p2 = Runtime.getRuntime().exec(query); // query for merging the video files is executed here String ls_str = ""; DataInputStream ls_in = new DataInputStream(p2.getInputStream()); while ((ls_str = ls_in.readLine()) != null) {} p2.destroy(); hdfs.copyFromLocalFile( true, true, new Path("/home/" + foutname), new Path(prefixPath + "/" + foutname)); cnt = 0; String dlt1 = ""; for (String st3 : outl) { cnt++; dlt1 += " " + "/home/" + fnm.split("_")[0] + "_" + Integer.toString(cnt) + "." + ext; } Runtime rt1 = Runtime.getRuntime(); String[] cmd1 = {"/bin/bash", "-c", "rm" + dlt1}; // delete the files after use Process pr1 = rt1.exec(cmd1); pr1.waitFor(); context.write(new Text(""), new Text(prefixPath + "/" + foutname)); } catch (IOException e) { System.out.println("exception happened - here's what I know: "); e.printStackTrace(); System.exit(-1); } }