 @Override
 public boolean equals(Object bw) {
   if (bw instanceof BigramWritable) {
     BigramWritable ow = (BigramWritable) bw;
     return leftBigram.equals(ow.leftBigram) && rightBigram.equals(ow.rightBigram);
   }
   return false;
 }
 @Override
 public boolean equals(Object o) {
   if (o instanceof TextPair) {
     TextPair tp = (TextPair) o;
     return first.equals(tp.first) && second.equals(tp.second);
   }
   return false;
 }
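When a custom Writable like the TextPair above overrides equals(), it should also override hashCode() so that HashPartitioner sends equal keys to the same reducer. A minimal sketch, assuming the first/second Text fields from the example above:

  @Override
  public int hashCode() {
    // Must be consistent with equals(): combine the hashes of both Text fields
    return first.hashCode() * 163 + second.hashCode();
  }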
Example #3
  /** Check whether the file list has duplicates. */
  private static void checkDuplication(FileSystem fs, Path file, Path sorted, Configuration conf)
      throws IOException {
    SequenceFile.Reader in = null;
    try {
      SequenceFile.Sorter sorter =
          new SequenceFile.Sorter(fs, new Text.Comparator(), Text.class, Text.class, conf);
      sorter.sort(file, sorted);
      in = new SequenceFile.Reader(fs, sorted, conf);

      Text prevdst = null, curdst = new Text();
      Text prevsrc = null, cursrc = new Text();
      while (in.next(curdst, cursrc)) {
        if (prevdst != null && curdst.equals(prevdst)) {
          throw new DuplicationException(
              "Invalid input, there are duplicated files in the sources: "
                  + prevsrc
                  + ", "
                  + cursrc);
        }
        prevdst = curdst;
        curdst = new Text();
        prevsrc = cursrc;
        cursrc = new Text();
      }
    } finally {
      checkAndClose(in);
    }
  }
  public static List<LogEntry> getLogEntries(Credentials credentials, KeyExtent extent)
      throws IOException, KeeperException, InterruptedException {
    log.info("Scanning logging entries for " + extent);
    ArrayList<LogEntry> result = new ArrayList<LogEntry>();
    if (extent.equals(RootTable.EXTENT)) {
      log.info("Getting logs for root tablet from zookeeper");
      getRootLogEntries(result);
    } else {
      log.info("Scanning metadata for logs used for tablet " + extent);
      Scanner scanner = getTabletLogScanner(credentials, extent);
      Text pattern = extent.getMetadataEntry();
      for (Entry<Key, Value> entry : scanner) {
        Text row = entry.getKey().getRow();
        if (entry.getKey().getColumnFamily().equals(LogColumnFamily.NAME)
            && row.equals(pattern)) {
          result.add(LogEntry.fromKeyValue(entry.getKey(), entry.getValue()));
        }
      }
    }

    Collections.sort(
        result,
        new Comparator<LogEntry>() {
          @Override
          public int compare(LogEntry o1, LogEntry o2) {
            long diff = o1.timestamp - o2.timestamp;
            if (diff < 0) return -1;
            if (diff > 0) return 1;
            return 0;
          }
        });
    log.info("Returning logs " + result + " for extent " + extent);
    return result;
  }
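The hand-rolled comparator above is correct for realistic timestamps, but subtracting two longs can overflow for extreme values. A shorter, overflow-safe variant (a sketch, assuming Java 7+ where Long.compare is available):

    Collections.sort(
        result,
        new Comparator<LogEntry>() {
          @Override
          public int compare(LogEntry o1, LogEntry o2) {
            // Long.compare avoids the subtraction entirely
            return Long.compare(o1.timestamp, o2.timestamp);
          }
        });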
Example #5
  @Override
  public boolean equals(Object obj) {
    boolean isEqual = false;
    if (obj instanceof TextLong) {
      TextLong other = (TextLong) obj;
      isEqual = first.equals(other.first) && second.equals(other.second);
    }

    return isEqual;
  }
  @Override
  public boolean equals(Object o) {
    if (o instanceof TermDocIdWritable) {
      TermDocIdWritable tuple = (TermDocIdWritable) o;
      return (__documentId.equals(tuple.getDocumentId()) && __term.equals(tuple.getTerm()));
    }

    return false;
  }
Example #7
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;

    if (obj == null) return false;

    if (getClass() != obj.getClass()) return false;

    Pair other = (Pair) obj;
    if (element == null) {
      if (other.element != null) return false;
    } else if (!element.equals(other.element)) return false;

    if (neighbour == null) {
      if (other.neighbour != null) return false;
    } else if (!neighbour.equals(other.neighbour)) return false;

    return true;
  }
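The null-safe field comparisons in Pair.equals() above can be written more compactly with java.util.Objects.equals (Java 7+); a sketch assuming the same element and neighbour fields:

  // requires: import java.util.Objects;
  @Override
  public boolean equals(Object obj) {
    if (this == obj) return true;
    if (obj == null || getClass() != obj.getClass()) return false;
    Pair other = (Pair) obj;
    // Objects.equals performs the null checks on both fields
    return Objects.equals(element, other.element) && Objects.equals(neighbour, other.neighbour);
  }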
  @Override
  public void map(LongWritable key, Text value, Context context)
      throws IOException, InterruptedException {
    context.getCounter(MYCOUNTER.RECORD_COUNT).increment(1);

    if (value.toString().length() > 0) {
      String arrEmpAttributes[] = value.toString().split("\\t");
      txtMapLookupKey.set(arrEmpAttributes[6].toString());

      try {
        // txtMapLookupKey = deptNo
        // txtMapLookupValue = deptName
        deptMapReader.get(txtMapLookupKey, txtMapLookupValue);
      } finally {
        txtMapLookupValue.set(
            (txtMapLookupValue.equals(null) || txtMapLookupValue.equals(""))
                ? "NOT-FOUND"
                : txtMapLookupValue.toString());
      }

      txtMapOutputKey.set(arrEmpAttributes[0].toString()); // empNo --> joinKey
      txtMapOutputValue.set(
          arrEmpAttributes[1].toString()
              + "\t"
              + arrEmpAttributes[1].toString()
              + "\t"
              + arrEmpAttributes[2].toString()
              + "\t"
              + arrEmpAttributes[3].toString()
              + "\t"
              + arrEmpAttributes[4].toString()
              + "\t"
              + arrEmpAttributes[5].toString()
              + "\t"
              + arrEmpAttributes[6].toString()
              + "\t" // deptNo
              + txtMapLookupValue.toString()); // deptName
      context.write(txtMapOutputKey, txtMapOutputValue); // write only when the input line was non-empty
    }
    txtMapLookupValue.set("");
    txtMapLookupKey.set("");
  }
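An alternative to testing for an empty value after the lookup: MapFile.Reader.get(key, val) returns null when the key is not present, so the hit/miss decision can be made on the return value directly. A sketch reusing the fields from the example above:

      // get() fills txtMapLookupValue and returns it when the key exists,
      // or returns null (leaving the value untouched) when it does not.
      org.apache.hadoop.io.Writable hit = deptMapReader.get(txtMapLookupKey, txtMapLookupValue);
      if (hit == null) {
        txtMapLookupValue.set("NOT-FOUND");
      }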
 @Override
 public Token<StramDelegationTokenIdentifier> selectToken(
     Text text, Collection<Token<? extends TokenIdentifier>> clctn) {
   Token<StramDelegationTokenIdentifier> token = null;
   if (text != null) {
     for (Token<? extends TokenIdentifier> ctoken : clctn) {
       if (StramDelegationTokenIdentifier.IDENTIFIER_KIND.equals(ctoken.getKind())
           && text.equals(ctoken.getService())) {
         token = (Token<StramDelegationTokenIdentifier>) ctoken;
       }
     }
   }
   return token;
 }
 @Override
 public boolean equals(Object obj) {
   if (this == obj) return true;
   if (obj == null) return false;
   if (getClass() != obj.getClass()) return false;
   KeyValueWritable other = (KeyValueWritable) obj;
   if (key == null) {
     if (other.key != null) return false;
   } else if (!key.equals(other.key)) return false;
   if (value == null) {
     if (other.value != null) return false;
   } else if (!value.equals(other.value)) return false;
   return true;
 }
Example #11
  private void cloneOutput() throws IOException {

    List<FileStatus> listStatus = getOutputMappings();

    /*
     * Initialize to empty list, in which case swap() will be a no-op. The reference is then replaced with a real list, which is
     * used in the subsequent iterations.
     */
    List<Path> crushInput = emptyList();

    Text srcFile = new Text();
    Text crushOut = new Text();
    Text prevCrushOut = new Text();

    for (FileStatus partFile : listStatus) {
      Path path = partFile.getPath();

      Reader reader = new Reader(fs, path, fs.getConf());

      try {
        while (reader.next(srcFile, crushOut)) {
          if (!crushOut.equals(prevCrushOut)) {
            swap(crushInput, prevCrushOut.toString());

            prevCrushOut.set(crushOut);
            crushInput = new LinkedList<Path>();
          }

          crushInput.add(new Path(srcFile.toString()));
        }
      } finally {
        try {
          reader.close();
        } catch (IOException e) {
          LOG.warn("Trapped exception when closing " + path, e);
        }
      }

      swap(crushInput, prevCrushOut.toString());
    }
  }
Example #12
    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {

      // System.out.println("BDM: In reduce.");

      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }

      if (key.equals(new Text("bytes"))) {

        context.write(new Text("bytes"), new IntWritable(sum));

      } else {

        context.write(new Text("lines"), new IntWritable(sum));
      }

      context.write(new Text("reducers"), new IntWritable(1));
    }
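Because Text.equals() compares the underlying bytes, the reducer above allocates a fresh Text("bytes") on every call just to test the key. A sketch of the same logic using reusable constants (assuming the same "bytes"/"lines" keys):

    private static final Text BYTES = new Text("bytes");
    private static final Text LINES = new Text("lines");

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      // Compare against the shared constants instead of allocating new Text objects per call
      context.write(key.equals(BYTES) ? BYTES : LINES, new IntWritable(sum));
      context.write(new Text("reducers"), new IntWritable(1));
    }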
 @Override
 public boolean handleKind(Text kind) {
   return KIND.equals(kind);
 }
Example #14
  private ParseStatus output(
      Text key,
      CrawlDatum datum,
      Content content,
      ProtocolStatus pstatus,
      int status,
      int outlinkDepth) {

    datum.setStatus(status);
    datum.setFetchTime(System.currentTimeMillis());
    if (pstatus != null) datum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, pstatus);

    ParseResult parseResult = null;
    if (content != null) {
      Metadata metadata = content.getMetadata();

      // store the guessed content type in the crawldatum
      if (content.getContentType() != null)
        datum
            .getMetaData()
            .put(new Text(Metadata.CONTENT_TYPE), new Text(content.getContentType()));

      // add segment to metadata
      metadata.set(Nutch.SEGMENT_NAME_KEY, segmentName);
      // add score to content metadata so that ParseSegment can pick it up.
      try {
        scfilters.passScoreBeforeParsing(key, datum, content);
      } catch (Exception e) {
        if (LOG.isWarnEnabled()) {
          LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
        }
      }
      /*
       * Note: Fetcher will only follow meta-redirects coming from the
       * original URL.
       */
      if (parsing && status == CrawlDatum.STATUS_FETCH_SUCCESS) {
        if (!skipTruncated || !ParseSegment.isTruncated(content)) {
          try {
            parseResult = this.parseUtil.parse(content);
          } catch (Exception e) {
            LOG.warn("Error parsing: " + key + ": " + StringUtils.stringifyException(e));
          }
        }

        if (parseResult == null) {
          byte[] signature =
              SignatureFactory.getSignature(conf)
                  .calculate(content, new ParseStatus().getEmptyParse(conf));
          datum.setSignature(signature);
        }
      }

      /*
       * Store status code in content So we can read this value during parsing
       * (as a separate job) and decide to parse or not.
       */
      content.getMetadata().add(Nutch.FETCH_STATUS_KEY, Integer.toString(status));
    }

    try {
      output.collect(key, new NutchWritable(datum));
      if (content != null && storingContent) output.collect(key, new NutchWritable(content));
      if (parseResult != null) {
        for (Entry<Text, Parse> entry : parseResult) {
          Text url = entry.getKey();
          Parse parse = entry.getValue();
          ParseStatus parseStatus = parse.getData().getStatus();
          ParseData parseData = parse.getData();

          if (!parseStatus.isSuccess()) {
            LOG.warn("Error parsing: " + key + ": " + parseStatus);
            parse = parseStatus.getEmptyParse(conf);
          }

          // Calculate page signature. For non-parsing fetchers this will
          // be done in ParseSegment
          byte[] signature = SignatureFactory.getSignature(conf).calculate(content, parse);
          // Ensure segment name and score are in parseData metadata
          parseData.getContentMeta().set(Nutch.SEGMENT_NAME_KEY, segmentName);
          parseData.getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
          // Pass fetch time to content meta
          parseData.getContentMeta().set(Nutch.FETCH_TIME_KEY, Long.toString(datum.getFetchTime()));
          if (url.equals(key)) datum.setSignature(signature);
          try {
            scfilters.passScoreAfterParsing(url, content, parse);
          } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
              LOG.warn("Couldn't pass score, url " + key + " (" + e + ")");
            }
          }

          String origin = null;

          // collect outlinks for subsequent db update
          Outlink[] links = parseData.getOutlinks();
          int outlinksToStore = Math.min(maxOutlinks, links.length);
          if (ignoreExternalLinks || ignoreInternalLinks) {
            URL originURL = new URL(url.toString());
            // based on domain?
            if ("bydomain".equalsIgnoreCase(ignoreExternalLinksMode)) {
              origin = URLUtil.getDomainName(originURL).toLowerCase();
            }
            // use host
            else {
              origin = originURL.getHost().toLowerCase();
            }
          }

          // used by fetchNode
          if (fetchNode != null) {
            fetchNode.setOutlinks(links);
            fetchNode.setTitle(parseData.getTitle());
            FetchNodeDb.getInstance().put(fetchNode.getUrl().toString(), fetchNode);
          }
          int validCount = 0;

          // Process all outlinks, normalize, filter and deduplicate
          List<Outlink> outlinkList = new ArrayList<Outlink>(outlinksToStore);
          HashSet<String> outlinks = new HashSet<String>(outlinksToStore);
          for (int i = 0; i < links.length && validCount < outlinksToStore; i++) {
            String toUrl = links[i].getToUrl();

            toUrl =
                ParseOutputFormat.filterNormalize(
                    url.toString(),
                    toUrl,
                    origin,
                    ignoreInternalLinks,
                    ignoreExternalLinks,
                    ignoreExternalLinksMode,
                    urlFilters,
                    urlExemptionFilters,
                    normalizers);
            if (toUrl == null) {
              continue;
            }

            validCount++;
            links[i].setUrl(toUrl);
            outlinkList.add(links[i]);
            outlinks.add(toUrl);
          }

          // Only process depth N outlinks
          if (maxOutlinkDepth > 0 && outlinkDepth < maxOutlinkDepth) {
            reporter.incrCounter("FetcherOutlinks", "outlinks_detected", outlinks.size());

            // Counter to limit num outlinks to follow per page
            int outlinkCounter = 0;

            // Calculate variable number of outlinks by depth using the
            // divisor (outlinks = Math.floor(divisor / depth * num.links))
            int maxOutlinksByDepth =
                (int)
                    Math.floor(outlinksDepthDivisor / (outlinkDepth + 1) * maxOutlinkDepthNumLinks);

            String followUrl;

            // Walk over the outlinks and add as new FetchItem to the queues
            Iterator<String> iter = outlinks.iterator();
            while (iter.hasNext() && outlinkCounter < maxOutlinkDepthNumLinks) {
              followUrl = iter.next();

              // Check whether we'll follow external outlinks
              if (outlinksIgnoreExternal) {
                if (!URLUtil.getHost(url.toString()).equals(URLUtil.getHost(followUrl))) {
                  continue;
                }
              }

              reporter.incrCounter("FetcherOutlinks", "outlinks_following", 1);

              // Create new FetchItem with depth incremented
              FetchItem fit =
                  FetchItem.create(
                      new Text(followUrl),
                      new CrawlDatum(CrawlDatum.STATUS_LINKED, interval),
                      queueMode,
                      outlinkDepth + 1);
              ((FetchItemQueues) fetchQueues).addFetchItem(fit);

              outlinkCounter++;
            }
          }

          // Overwrite the outlinks in ParseData with the normalized and
          // filtered set
          parseData.setOutlinks(outlinkList.toArray(new Outlink[outlinkList.size()]));

          output.collect(
              url,
              new NutchWritable(
                  new ParseImpl(new ParseText(parse.getText()), parseData, parse.isCanonical())));
        }
      }
    } catch (IOException e) {
      if (LOG.isErrorEnabled()) {
        LOG.error("fetcher caught:" + e.toString());
      }
    }

    // return parse status if it exists
    if (parseResult != null && !parseResult.isEmpty()) {
      Parse p = parseResult.get(content.getUrl());
      if (p != null) {
        reporter.incrCounter(
            "ParserStatus", ParseStatus.majorCodes[p.getData().getStatus().getMajorCode()], 1);
        return p.getData().getStatus();
      }
    }
    return null;
  }
Example #15
  @Test
  public void testHadoop20JHParser() throws Exception {
    // Disabled
    if (true) return;

    final Configuration conf = new Configuration();
    final FileSystem lfs = FileSystem.getLocal(conf);

    boolean success = false;

    final Path rootInputDir =
        new Path(System.getProperty("test.tools.input.dir", "")).makeQualified(lfs);
    final Path rootTempDir =
        new Path(System.getProperty("test.build.data", "/tmp")).makeQualified(lfs);

    final Path rootInputPath = new Path(rootInputDir, "rumen/small-trace-test");
    final Path tempDir = new Path(rootTempDir, "TestHadoop20JHParser");
    lfs.delete(tempDir, true);

    final Path inputPath = new Path(rootInputPath, "v20-single-input-log.gz");
    final Path goldPath = new Path(rootInputPath, "v20-single-input-log-event-classes.text.gz");

    InputStream inputLogStream = new PossiblyDecompressedInputStream(inputPath, conf);

    InputStream inputGoldStream = new PossiblyDecompressedInputStream(goldPath, conf);

    BufferedInputStream bis = new BufferedInputStream(inputLogStream);
    bis.mark(10000);
    Hadoop20JHParser parser = new Hadoop20JHParser(bis);

    final Path resultPath = new Path(tempDir, "result.text");

    System.out.println("testHadoop20JHParser sent its output to " + resultPath);

    Compressor compressor;

    FileSystem fs = resultPath.getFileSystem(conf);
    CompressionCodec codec = new CompressionCodecFactory(conf).getCodec(resultPath);
    OutputStream output;
    if (codec != null) {
      compressor = CodecPool.getCompressor(codec);
      output = codec.createOutputStream(fs.create(resultPath), compressor);
    } else {
      output = fs.create(resultPath);
    }

    PrintStream printStream = new PrintStream(output);

    try {
      assertEquals(
          "Hadoop20JHParser can't parse the test file",
          true,
          Hadoop20JHParser.canParse(inputLogStream));

      bis.reset();

      HistoryEvent event = parser.nextEvent();

      while (event != null) {
        printStream.println(event.getClass().getCanonicalName());
        event = parser.nextEvent();
      }

      printStream.close();

      LineReader goldLines = new LineReader(inputGoldStream);
      LineReader resultLines =
          new LineReader(new PossiblyDecompressedInputStream(resultPath, conf));

      int lineNumber = 1;

      try {
        Text goldLine = new Text();
        Text resultLine = new Text();

        int goldRead = goldLines.readLine(goldLine);
        int resultRead = resultLines.readLine(resultLine);

        while (goldRead * resultRead != 0) {
          if (!goldLine.equals(resultLine)) {
            assertEquals("Type mismatch detected", goldLine, resultLine);
            break;
          }

          goldRead = goldLines.readLine(goldLine);
          resultRead = resultLines.readLine(resultLine);

          ++lineNumber;
        }

        if (goldRead != resultRead) {
          assertEquals(
              "the "
                  + (goldRead > resultRead ? "gold" : resultRead)
                  + " file contains more text at line "
                  + lineNumber,
              goldRead,
              resultRead);
        }

        success = true;
      } finally {
        goldLines.close();
        resultLines.close();

        if (success) {
          lfs.delete(resultPath, false);
        }
      }

    } finally {
      if (parser == null) {
        inputLogStream.close();
      } else {
        parser.close();
      }

      if (inputGoldStream != null) {
        inputGoldStream.close();
      }

      // it's okay to do this twice [if we get an error on input]
      printStream.close();
    }
  }
 public int getPartition(Text key, Text value, int numReduceTasks) {
   if (numReduceTasks == 0) return 0;
   if (key.equals(new Text("Cricket")) && !value.equals(new Text("India"))) return 0;
   if (key.equals(new Text("Cricket")) && value.equals(new Text("India"))) return 1;
   return 2;
 }
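For a partitioner like the one above to take effect, the driver has to register it and request matching reducer slots. A minimal sketch, assuming the class is named SportsPartitioner (a hypothetical name) and the mapreduce-API Job is used:

   // requires: org.apache.hadoop.conf.Configuration, org.apache.hadoop.mapreduce.Job
   Job job = Job.getInstance(new Configuration(), "sports-partition");
   job.setPartitionerClass(SportsPartitioner.class);
   job.setNumReduceTasks(3); // partitions 0, 1 and 2 each need their own reducer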