// Value-based equality for BigramWritable: two instances are equal when both
// bigram components match.
@Override
public boolean equals(Object bw) {
  if (bw instanceof BigramWritable) {
    BigramWritable ow = (BigramWritable) bw;
    return leftBigram.equals(ow.leftBigram) && rightBigram.equals(ow.rightBigram);
  }
  return false;
}
@Override
public boolean equals(Object o) {
  if (o instanceof TextPair) {
    TextPair tp = (TextPair) o;
    return first.equals(tp.first) && second.equals(tp.second);
  }
  return false;
}
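// equals() implementations like the two above should be paired with a
// consistent hashCode(), or hash-based containers and Hadoop partitioners will
// misbehave. A minimal sketch for the TextPair case, using the common
// multiply-and-add pattern (the constant 163 is an arbitrary choice):
@Override
public int hashCode() {
  return first.hashCode() * 163 + second.hashCode();
}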
/** Check whether the file list contains any duplicate entries. */
private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
    throws IOException {
  SequenceFile.Reader in = null;
  try {
    // Sort the (dst, src) pairs by destination so that duplicates become adjacent.
    SequenceFile.Sorter sorter =
        new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf);
    sorter.sort(file, sorted);

    in = new SequenceFile.Reader(fs, sorted, conf);
    Text prevdst = null, curdst = new Text();
    Text prevsrc = null, cursrc = new Text();
    while (in.next(curdst, cursrc)) {
      if (prevdst != null && curdst.equals(prevdst)) {
        throw new DuplicationException(
            "Invalid input, there are duplicated files in the sources: "
                + prevsrc + ", " + cursrc);
      }
      prevdst = curdst;
      curdst = new Text();
      prevsrc = cursrc;
      cursrc = new Text();
    }
  } finally {
    checkAndClose(in);
  }
}
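// checkAndClose(...) is referenced above but not shown. A minimal sketch of a
// null-safe close helper (hypothetical; the real helper may also log the
// failure):
private static boolean checkAndClose(java.io.Closeable io) {
  if (io != null) {
    try {
      io.close();
    } catch (IOException e) {
      return false; // assumption: report the failure via the return value
    }
  }
  return true;
}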
public static List<LogEntry> getLogEntries(Credentials credentials, KeyExtent extent)
    throws IOException, KeeperException, InterruptedException {
  log.info("Scanning logging entries for " + extent);
  ArrayList<LogEntry> result = new ArrayList<LogEntry>();
  if (extent.equals(RootTable.EXTENT)) {
    log.info("Getting logs for root tablet from zookeeper");
    getRootLogEntries(result);
  } else {
    log.info("Scanning metadata for logs used for tablet " + extent);
    Scanner scanner = getTabletLogScanner(credentials, extent);
    Text pattern = extent.getMetadataEntry();
    for (Entry<Key, Value> entry : scanner) {
      Text row = entry.getKey().getRow();
      if (entry.getKey().getColumnFamily().equals(LogColumnFamily.NAME)) {
        if (row.equals(pattern)) {
          result.add(LogEntry.fromKeyValue(entry.getKey(), entry.getValue()));
        }
      }
    }
  }

  // Sort the discovered log entries by timestamp, oldest first.
  Collections.sort(
      result,
      new Comparator<LogEntry>() {
        @Override
        public int compare(LogEntry o1, LogEntry o2) {
          long diff = o1.timestamp - o2.timestamp;
          if (diff < 0) return -1;
          if (diff > 0) return 1;
          return 0;
        }
      });

  log.info("Returning logs " + result + " for extent " + extent);
  return result;
}
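// On Java 7+ the timestamp comparator above can be expressed with the standard
// library instead of hand-rolled sign logic:
//
//   public int compare(LogEntry o1, LogEntry o2) {
//     return Long.compare(o1.timestamp, o2.timestamp);
//   }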
@Override
public boolean equals(Object obj) {
  boolean isEqual = false;
  if (obj instanceof TextLong) {
    TextLong other = (TextLong) obj;
    isEqual = first.equals(other.first) && second.equals(other.second);
  }
  return isEqual;
}
@Override
public boolean equals(Object o) {
  if (o instanceof TermDocIdWritable) {
    TermDocIdWritable tuple = (TermDocIdWritable) o;
    return (__documentId.equals(tuple.getDocumentId()) && __term.equals(tuple.getTerm()));
  }
  return false;
}
@Override
public boolean equals(Object obj) {
  if (this == obj)
    return true;
  if (obj == null)
    return false;
  if (getClass() != obj.getClass())
    return false;
  Pair other = (Pair) obj;
  if (element == null) {
    if (other.element != null)
      return false;
  } else if (!element.equals(other.element))
    return false;
  if (neighbour == null) {
    if (other.neighbour != null)
      return false;
  } else if (!neighbour.equals(other.neighbour))
    return false;
  return true;
}
@Override
public void map(LongWritable key, Text value, Context context)
    throws IOException, InterruptedException {
  context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1);

  if (value.toString().length() > 0) {
    String[] arrEmpAttributes = value.toString().split("\\t");

    // txtMapLookupKey = deptNo, txtMapLookupValue = deptName
    txtMapLookupKey.set(arrEmpAttributes[6]);
    try {
      deptMapReader.get(txtMapLookupKey, txtMapLookupValue);
    } finally {
      // Note: Text.equals(null) and Text.equals("") are always false, so test
      // the length of the looked-up value instead.
      txtMapLookupValue.set(
          txtMapLookupValue.getLength() == 0 ? "NOT-FOUND" : txtMapLookupValue.toString());
    }

    txtMapOutputKey.set(arrEmpAttributes[0]); // empNo --> joinKey
    txtMapOutputValue.set(
        arrEmpAttributes[1] + "\t"
            + arrEmpAttributes[2] + "\t"
            + arrEmpAttributes[3] + "\t"
            + arrEmpAttributes[4] + "\t"
            + arrEmpAttributes[5] + "\t"
            + arrEmpAttributes[6] + "\t" // deptNo
            + txtMapLookupValue.toString()); // deptName

    context.write(txtMapOutputKey, txtMapOutputValue);
  }

  txtMapLookupValue.set("");
  txtMapLookupKey.set("");
}
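// The mapper above assumes deptMapReader was opened before the first map()
// call. A minimal setup() sketch, assuming the deptNo -> deptName MapFile was
// shipped via the distributed cache (field name deptMapReader comes from the
// mapper; the cache wiring is an assumption):
@Override
protected void setup(Context context) throws IOException {
  Configuration conf = context.getConfiguration();
  Path[] cacheFiles = DistributedCache.getLocalCacheFiles(conf);
  if (cacheFiles != null && cacheFiles.length > 0) {
    // Open the local MapFile directory holding the department lookup table.
    deptMapReader =
        new MapFile.Reader(FileSystem.getLocal(conf), cacheFiles[0].toString(), conf);
  }
}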
// Selects the Stram delegation token for the given service from a collection
// of tokens; returns null when no matching token is present.
@SuppressWarnings("unchecked")
@Override
public Token<StramDelegationTokenIdentifier> selectToken(
    Text text, Collection<Token<? extends TokenIdentifier>> clctn) {
  Token<StramDelegationTokenIdentifier> token = null;
  if (text != null) {
    for (Token<? extends TokenIdentifier> ctoken : clctn) {
      if (StramDelegationTokenIdentifier.IDENTIFIER_KIND.equals(ctoken.getKind())
          && text.equals(ctoken.getService())) {
        token = (Token<StramDelegationTokenIdentifier>) ctoken;
      }
    }
  }
  return token;
}
@Override
public boolean equals(Object obj) {
  if (this == obj)
    return true;
  if (obj == null)
    return false;
  if (getClass() != obj.getClass())
    return false;
  KeyValueWritable other = (KeyValueWritable) obj;
  if (key == null) {
    if (other.key != null)
      return false;
  } else if (!key.equals(other.key))
    return false;
  if (value == null) {
    if (other.value != null)
      return false;
  } else if (!value.equals(other.value))
    return false;
  return true;
}
private void cloneOutput() throws IOException {
  List<FileStatus> listStatus = getOutputMappings();

  /*
   * Initialize to empty list, in which case swap() will be a no-op. The
   * reference is then replaced with a real list, which is used in the
   * subsequent iterations.
   */
  List<Path> crushInput = emptyList();

  Text srcFile = new Text();
  Text crushOut = new Text();
  Text prevCrushOut = new Text();

  for (FileStatus partFile : listStatus) {
    Path path = partFile.getPath();

    Reader reader = new Reader(fs, path, fs.getConf());
    try {
      while (reader.next(srcFile, crushOut)) {
        if (!crushOut.equals(prevCrushOut)) {
          swap(crushInput, prevCrushOut.toString());
          prevCrushOut.set(crushOut);
          crushInput = new LinkedList<Path>();
        }
        crushInput.add(new Path(srcFile.toString()));
      }
    } finally {
      try {
        reader.close();
      } catch (IOException e) {
        LOG.warn("Trapped exception when closing " + path, e);
      }
    }

    swap(crushInput, prevCrushOut.toString());
  }
}
public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
  int sum = 0;
  for (IntWritable val : values) {
    sum += val.get();
  }
  if (key.equals(new Text("bytes"))) {
    context.write(new Text("bytes"), new IntWritable(sum));
  } else {
    context.write(new Text("lines"), new IntWritable(sum));
  }
  context.write(new Text("reducers"), new IntWritable(1));
}
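// A note on allocation: the reducer above constructs fresh Text and
// IntWritable objects on every call. A minimal sketch of the same logic with
// reused instances (the constant and field names are hypothetical):
private static final Text BYTES = new Text("bytes");
private static final Text LINES = new Text("lines");
private static final Text REDUCERS = new Text("reducers");
private final IntWritable sumWritable = new IntWritable();
private final IntWritable one = new IntWritable(1);

public void reduce(Text key, Iterable<IntWritable> values, Context context)
    throws IOException, InterruptedException {
  int sum = 0;
  for (IntWritable val : values) {
    sum += val.get();
  }
  sumWritable.set(sum);
  context.write(key.equals(BYTES) ? BYTES : LINES, sumWritable);
  context.write(REDUCERS, one);
}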
@Override
public boolean handleKind(Text kind) {
  return KIND.equals(kind);
}
private ParseStatus output(
    Text key,
    CrawlDatum datum,
    Content content,
    ProtocolStatus pstatus,
    int status,
    int outlinkDepth) {

  datum.setStatus(status);
  datum.setFetchTime(System.currentTimeMillis());
  if (pstatus != null)
    datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

  ParseResult parseResult = null;
  if (content != null) {
    Metadata metadata = content.getMetadata();

    // store the guessed content type in the crawldatum
    if (content.getContentType() != null)
      datum.getMetaData()
          .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

    // add segment to metadata
    metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
    // add score to content metadata so that ParseSegment can pick it up.
    try {
      scfilters.passScoreBeforeParsing(key, datum, content);
    } catch (Exception e) {
      if (LOG.isWarnEnabled()) {
        LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
      }
    }

    /*
     * Note: Fetcher will only follow meta-redirects coming from the
     * original URL.
     */
    if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
      if (!skipTruncated || (skipTruncated && !ParseSegment.isTruncated(content))) {
        try {
          parseResult = this.parseUtil.parse(content);
        } catch (Exception e) {
          LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
        }
      }

      if (parseResult == null) {
        byte[] signature =
            SignatureFactory.getSignature(conf)
                .calculate(content, new ParseStatus().getEmptyParse(conf));
        datum.setSignature(signature);
      }
    }

    /*
     * Store the status code in content so we can read this value during
     * parsing (as a separate job) and decide to parse or not.
     */
    content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
  }

  try {
    output.collect(key, new NutchWritable(datum));
    if (content != null && storingContent)
      output.collect(key, new NutchWritable(content));
    if (parseResult != null) {
      for (Entry<Text, Parse> entry : parseResult) {
        Text url = entry.getKey();
        Parse parse = entry.getValue();
        ParseStatus parseStatus = parse.getData().getStatus();
        ParseData parseData = parse.getData();

        if (!parseStatus.isSuccess()) {
          LOG.warn("Error parsing: " + key + ": " + parseStatus);
          parse = parseStatus.getEmptyParse(conf);
        }

        // Calculate page signature. For non-parsing fetchers this will
        // be done in ParseSegment
        byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);

        // Ensure segment name and score are in parseData metadata
        parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
        parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        // Pass fetch time to content meta
        parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
        if (url.equals(key))
          datum.setSignature(signature);
        try {
          scfilters.passScoreAfterParsing(url, content, parse);
        } catch (Exception e) {
          if (LOG.isWarnEnabled()) {
            LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
          }
        }

        String origin = null;

        // collect outlinks for subsequent db update
        Outlink[] links = parseData.getOutlinks();
        int outlinksToStore = Math.min(maxOutlinks, links.length);
        if (ignoreExternalLinks || ignoreInternalLinks) {
          URL originURL = new URL(url.toString());
          // based on domain?
          if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
            origin = URLUtil.getDomainName(originURL).toLowerCase();
          }
          // use host
          else {
            origin = originURL.getHost().toLowerCase();
          }
        }

        // used by fetchNode
        if (fetchNode != null) {
          fetchNode.setOutlinks(links);
          fetchNode.setTitle(parseData.getTitle());
          FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
        }

        int validCount = 0;

        // Process all outlinks, normalize, filter and deduplicate
        List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
        HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
        for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
          String toUrl = links[i].getToUrl();

          toUrl =
              ParseOutputFormat.filterNormalize(
                  url.toString(),
                  toUrl,
                  origin,
                  ignoreInternalLinks,
                  ignoreExternalLinks,
                  ignoreExternalLinksMode,
                  urlFilters,
                  urlExemptionFilters,
                  normalizers);
          if (toUrl == null) {
            continue;
          }

          validCount++;
          links[i].setUrl(toUrl);
          outlinkList.add(links[i]);
          outlinks.add(toUrl);
        }

        // Only process depth N outlinks
        if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
          reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

          // Counter to limit num outlinks to follow per page
          int outlinkCounter = 0;

          // Calculate variable number of outlinks by depth using the
          // divisor (outlinks = Math.floor(divisor / depth * num.links))
          int maxOutlinksByDepth =
              (int) Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

          String followUrl;

          // Walk over the outlinks and add as new FetchItem to the queues
          Iterator<String> iter = outlinks.iterator();
          while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
            followUrl = iter.next();

            // Check whether we'll follow external outlinks
            if (outlinksIgnoreExternal) {
              if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                continue;
              }
            }

            reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

            // Create new FetchItem with depth incremented
            FetchItem fit =
                FetchItem.create(
                    new Text(followUrl),
                    new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                    queueMode,
                    outlinkDepth + 1);
            ((FetchItemQueues) fetchQueues).addFetchItem(fit);

            outlinkCounter++;
          }
        }

        // Overwrite the outlinks in ParseData with the normalized and
        // filtered set
        parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

        output.collect(
            url,
            new NutchWritable(
                new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
      }
    }
  } catch (IOException e) {
    if (LOG.isErrorEnabled()) {
      LOG.error("fetcher caught:" + e.toString());
    }
  }

  // return parse status if it exists
  if (parseResult != null && !parseResult.isEmpty()) {
    Parse p = parseResult.get(content.getUrl());
    if (p != null) {
      reporter.incrCounter(
          "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
      return p.getData().getStatus();
    }
  }
  return null;
}
@Test
public void testHadoop20JHParser() throws Exception {
  // Disabled
  if (true) return;

  final Configuration conf = new Configuration();
  final FileSystem lfs = FileSystem.getLocal(conf);

  boolean success = false;

  final Path rootInputDir =
      new Path(System.getProperty("test.tools.input.dir", "")).makeQualified(lfs);
  final Path rootTempDir =
      new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(lfs);

  final Path rootInputPath = new Path(rootInputDir, "rumen/small-trace-test");
  final Path tempDir = new Path(rootTempDir, "TestHadoop20JHParser");
  lfs.delete(tempDir, true);

  final Path inputPath = new Path(rootInputPath, "v20-single-input-log.gz");
  final Path goldPath = new Path(rootInputPath, "v20-single-input-log-event-classes.text.gz");

  InputStream inputLogStream = new PossiblyDecompressedInputStream(inputPath, conf);
  InputStream inputGoldStream = new PossiblyDecompressedInputStream(goldPath, conf);

  BufferedInputStream bis = new BufferedInputStream(inputLogStream);
  bis.mark(10000);
  Hadoop20JHParser parser = new Hadoop20JHParser(bis);

  final Path resultPath = new Path(tempDir, "result.text");

  System.out.println("testHadoop20JHParser sent its output to " + resultPath);

  Compressor compressor;

  FileSystem fs = resultPath.getFileSystem(conf);
  CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(resultPath);

  OutputStream output;
  if (codec != null) {
    compressor = CodecPool.getCompressor(codec);
    output = codec.createOutputStream(fs.create(resultPath), compressor);
  } else {
    output = fs.create(resultPath);
  }

  PrintStream printStream = new PrintStream(output);

  try {
    assertEquals(
        "Hadoop20JHParser can't parse the test file",
        true,
        Hadoop20JHParser.canParse(inputLogStream));

    bis.reset();

    HistoryEvent event = parser.nextEvent();
    while (event != null) {
      printStream.println(event.getClass().getCanonicalName());
      event = parser.nextEvent();
    }

    printStream.close();

    LineReader goldLines = new LineReader(inputGoldStream);
    LineReader resultLines =
        new LineReader(new PossiblyDecompressedInputStream(resultPath, conf));

    int lineNumber = 1;

    try {
      Text goldLine = new Text();
      Text resultLine = new Text();

      int goldRead = goldLines.readLine(goldLine);
      int resultRead = resultLines.readLine(resultLine);

      while (goldRead * resultRead != 0) {
        if (!goldLine.equals(resultLine)) {
          assertEquals("Type mismatch detected", goldLine, resultLine);
          break;
        }

        goldRead = goldLines.readLine(goldLine);
        resultRead = resultLines.readLine(resultLine);

        ++lineNumber;
      }

      if (goldRead != resultRead) {
        assertEquals(
            "the " + (goldRead > resultRead ? "gold" : "result")
                + " file contains more text at line " + lineNumber,
            goldRead,
            resultRead);
      }

      success = true;
    } finally {
      goldLines.close();
      resultLines.close();
      if (success) {
        lfs.delete(resultPath, false);
      }
    }
  } finally {
    if (parser == null) {
      inputLogStream.close();
    } else {
      parser.close();
    }
    if (inputGoldStream != null) {
      inputGoldStream.close();
    }
    // it's okay to do this twice [if we get an error on input]
    printStream.close();
  }
}
public int getPartition(Text key, Text value, int numReduceTasks) {
  if (numReduceTasks == 0)
    return 0;
  if (key.equals(new Text("Cricket")) && !value.equals(new Text("India")))
    return 0;
  if (key.equals(new Text("Cricket")) && value.equals(new Text("India")))
    return 1;
  else
    return 2;
}
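// The partitioner above allocates two Text objects on every call. A minimal
// equivalent sketch with cached constants (the constant names are
// hypothetical):
private static final Text CRICKET = new Text("Cricket");
private static final Text INDIA = new Text("India");

public int getPartition(Text key, Text value, int numReduceTasks) {
  if (numReduceTasks == 0) return 0;
  if (!key.equals(CRICKET)) return 2;  // all non-Cricket keys
  return value.equals(INDIA) ? 1 : 0;  // Cricket/India vs. other Cricket values
}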