void updateModelGivenUrlsSampleRecord(TextBytes inputData) {
  int curpos = inputData.getOffset();
  int endpos = inputData.getOffset() + inputData.getLength();

  byte[] lfPattern = { 0xA };
  byte[] tabPattern = { 0x9 };

  TextBytes urlText = new TextBytes();

  while (curpos != endpos) {
    // locate the tab separating the domain hash from the url ...
    int tabIndex = ByteArrayUtils.indexOf(
        inputData.getBytes(), curpos, endpos - curpos, tabPattern);
    if (tabIndex == -1) {
      break;
    } else {
      // locate the line feed terminating this record ...
      int lfIndex = ByteArrayUtils.indexOf(
          inputData.getBytes(), tabIndex + 1, endpos - (tabIndex + 1), lfPattern);
      if (lfIndex == -1) {
        break;
      } else {
        // parse the source domain hash preceding the tab ...
        long sourceDomainHash = ByteArrayUtils.parseLong(
            inputData.getBytes(), curpos, tabIndex - curpos, 10);
        // capture the url between the tab and the line feed ...
        urlText.set(inputData.getBytes(), tabIndex + 1, lfIndex - (tabIndex + 1));
        incoming.put(sourceDomainHash, urlText.toString());
        // advance past the line feed to the next record ...
        curpos = lfIndex + 1;
      }
    }
  }
}
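The record format consumed by this loop is one `<domainHash>\t<url>\n` entry per record. For clarity, here is an equivalent parse in plain Java over a decoded string; `incoming` is sketched as an ordinary `Map<Long, String>`, which is an assumption about the surrounding class, not something the snippet above confirms:

import java.util.HashMap;
import java.util.Map;

public class SampleRecordParseSketch {
  // hypothetical stand-in for the 'incoming' member used above
  private final Map<Long, String> incoming = new HashMap<>();

  void updateModelGivenUrlsSampleRecord(String inputData) {
    // each record is "<domainHash>\t<url>" terminated by a line feed
    for (String line : inputData.split("\n")) {
      int tab = line.indexOf('\t');
      if (tab == -1) {
        break; // mirror the byte-level loop: stop at the first malformed record
      }
      long sourceDomainHash = Long.parseLong(line.substring(0, tab));
      incoming.put(sourceDomainHash, line.substring(tab + 1));
    }
  }
}

The byte-level version above does the same work without ever materializing intermediate Strings, which matters when this runs per-record inside a MapReduce task.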
private static void rawValueToTextBytes(
    DataOutputBuffer dataBuffer, DataInputBuffer inputBuffer, TextBytes textOut)
    throws IOException {
  // rewind the reusable input buffer over the raw serialized bytes ...
  inputBuffer.reset(dataBuffer.getData(), dataBuffer.getLength());
  // values are framed as a vint length prefix followed by the utf-8 bytes ...
  int newLength = WritableUtils.readVInt(inputBuffer);
  textOut.set(inputBuffer.getData(), inputBuffer.getPosition(), newLength);
}
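A quick round trip illustrates the framing this helper expects: a VInt length prefix followed by the raw UTF-8 bytes. This sketch uses Hadoop's `Text` in place of `TextBytes` (assuming `TextBytes.set(byte[], int, int)` behaves the same way):

import java.io.IOException;
import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;

public class VIntFramingSketch {
  public static void main(String[] args) throws IOException {
    byte[] payload = "http://example.com/".getBytes("UTF-8");

    // write side: vint length prefix, then the raw bytes ...
    DataOutputBuffer out = new DataOutputBuffer();
    WritableUtils.writeVInt(out, payload.length);
    out.write(payload);

    // read side: mirrors rawValueToTextBytes above ...
    DataInputBuffer in = new DataInputBuffer();
    in.reset(out.getData(), out.getLength());
    int length = WritableUtils.readVInt(in);
    Text text = new Text();
    text.set(in.getData(), in.getPosition(), length);

    System.out.println(text); // http://example.com/
  }
}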
@Test
public void testSourceInputOutputWriters() throws IOException {
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  _sourceInputsTrackingFilter = new URLFPBloomFilter(100000, NUM_HASH_FUNCTIONS, NUM_BITS);

  String sourceDomainURL = "http://sourcedomain.com/foo";
  URLFPV2 sourceFP = URLUtils.getURLFPV2FromCanonicalURL(sourceDomainURL);

  String[] urls = { "http://somedomain.com/foo", "http://someother.com/bar" };

  for (String url : urls) {
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(url);
    // double insert and validate actual single insertion
    trackPotentialLinkSource(fp, url, sourceFP);
    trackPotentialLinkSource(fp, url, sourceFP);
  }

  // validate data ...
  TextBytes firstVersion = new TextBytes();
  firstVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());

  StringTokenizer tokenizer = new StringTokenizer(firstVersion.toString(), "\n");
  int itemIndex = 0;
  while (tokenizer.hasMoreElements()) {
    String nextLine = tokenizer.nextToken();
    String[] splits = nextLine.split("\t");
    // validate fp
    URLFPV2 fp = URLUtils.getURLFPV2FromCanonicalURL(urls[itemIndex]);
    Assert.assertEquals(fp.getDomainHash(), Long.parseLong(splits[0]));
    // validate actual url ...
    Assert.assertEquals(splits[1], urls[itemIndex]);
    itemIndex++;
  }

  // reset output buffer ...
  _sourceInputsBuffer = new DataOutputBuffer(16348 * 4);
  // and source bloom filter ...
  _sourceInputsTrackingFilter = new URLFPBloomFilter(10000000, NUM_HASH_FUNCTIONS, NUM_BITS);

  importLinkSourceData(sourceFP, firstVersion);

  // second text should match first ..
  TextBytes secondVersion = new TextBytes();
  secondVersion.set(_sourceInputsBuffer.getData(), 0, _sourceInputsBuffer.getLength());
  Assert.assertEquals(firstVersion, secondVersion);
}
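The double insert in the loop works because `trackPotentialLinkSource` presumably consults `_sourceInputsTrackingFilter` before appending to `_sourceInputsBuffer`. A minimal sketch of that guard pattern, using Guava's `BloomFilter` rather than the project's `URLFPBloomFilter` (class and member names here are illustrative, not taken from the codebase):

import java.nio.charset.StandardCharsets;
import com.google.common.hash.BloomFilter;
import com.google.common.hash.Funnels;

public class DedupTrackingSketch {
  private final BloomFilter<CharSequence> seen =
      BloomFilter.create(Funnels.stringFunnel(StandardCharsets.UTF_8), 100_000);
  private final StringBuilder buffer = new StringBuilder();

  // append "<domainHash>\t<url>\n" only on first sight of the url ...
  void track(long domainHash, String url) {
    if (!seen.mightContain(url)) {
      seen.put(url);
      buffer.append(domainHash).append('\t').append(url).append('\n');
    }
  }
}

Since a Bloom filter can report false positives but never false negatives, this trade-off can (rarely) drop a genuinely new URL but never emits a duplicate, which is the invariant the test's double-insert asserts.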
void iterateItems(MultiFileInputReader<TextBytes> multiFileInputReader, Reporter reporter)
    throws IOException {
  Pair<KeyAndValueData<TextBytes>, Iterable<RawRecordValue>> nextItem = null;
  int iterationCount = 0;

  while ((nextItem = multiFileInputReader.getNextItemIterator()) != null) {
    reporter.incrCounter(Counters.GOT_RECORD, 1);

    // decompose the composite partition key into type, domain, and url ...
    int type = PartitionUtils.getTypeGivenPartitionKey(nextItem.e0._keyObject);
    PartitionUtils.getDomainGivenPartitionKey(nextItem.e0._keyObject, _newDomainBytes);
    PartitionUtils.getURLGivenPartitionKey(nextItem.e0._keyObject, _newURLBytes);

    // flush accumulated state when the url changes ...
    if (_newURLBytes.compareTo(_contextURLBytes) != 0) {
      emitLastRecord(reporter);
    }

    long newDomainFP = SuperDomainList.domainFingerprintGivenName(_newDomainBytes.toString());
    if (newDomainFP != _currentDomainId) {
      reporter.incrCounter(Counters.TRANSITIONING_DOMAIN, 1);
      domainTransition(newDomainFP, _newDomainBytes.toString(), reporter);
    }

    RawRecordValue valueRaw = Iterables.getFirst(nextItem.e1, null);

    // accumulate per-url state based on the record type ...
    switch (type) {
      case CrawlListGeneratorTask.KEY_TYPE_CRAWLSTATS: {
        reporter.incrCounter(Counters.GOT_CRAWLSTATS, 1);
        setDomainStats(
            rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer), reporter);
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_HOMEPAGE_URL: {
        reporter.incrCounter(Counters.GOT_HOMEPAGE_DATA, 1);
        rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
        _flags |= HAS_HOMEPAGE_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_BLOGPROBE_URL: {
        reporter.incrCounter(Counters.GOT_BLOGPROBE_DATA, 1);
        rawValueToWritable(valueRaw, tempBuffer, _blogURLSkipFlag);
        _contextURLBytes.set(_newURLBytes);
        _flags |= HAS_BLOGPROBE_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_FEED_URL: {
        reporter.incrCounter(Counters.GOT_FEEDURL_DATA, 1);
        rawValueToTextBytes(valueRaw.data, tempBuffer, _contextURLBytes);
        _flags |= HAS_FEED_URLDATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_REDIRECT_RECORD: {
        reporter.incrCounter(Counters.GOT_REDIRECT_DATA, 1);
        _contextURLBytes.set(_newURLBytes);
        _flags |= HAS_REDIRECT_DATA;
      } break;

      case CrawlListGeneratorTask.KEY_TYPE_CRAWLDATA: {
        reporter.incrCounter(Counters.GOT_CRAWLURL_DATA, 1);
        _contextURLBytes.set(_newURLBytes);
        _crawlStatus = rawValueToJsonObject(valueRaw.data, tempBuffer, tempTextBuffer);
        _flags |= HAS_CRAWL_STATUS;
      } break;
    }
  }

  // flush trailing record ...
  emitLastRecord(reporter);
  flushDomain(reporter);
}
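The `_flags` bookkeeping above is a standard bitmask accumulator: each key type contributes one bit, and the emit path can later test exactly which record types arrived for the current URL. A small self-contained illustration of the pattern (the constant names mirror the ones used above, but the values here are assumptions made for the sketch):

public class FlagsSketch {
  static final int HAS_HOMEPAGE_URLDATA  = 1 << 0;
  static final int HAS_BLOGPROBE_URLDATA = 1 << 1;
  static final int HAS_FEED_URLDATA      = 1 << 2;
  static final int HAS_REDIRECT_DATA     = 1 << 3;
  static final int HAS_CRAWL_STATUS      = 1 << 4;

  public static void main(String[] args) {
    int flags = 0;

    // records for the same url OR their bits in as they arrive ...
    flags |= HAS_HOMEPAGE_URLDATA;
    flags |= HAS_CRAWL_STATUS;

    // the emit path can then test what it saw for this url ...
    System.out.println((flags & HAS_HOMEPAGE_URLDATA) != 0); // true
    System.out.println((flags & HAS_REDIRECT_DATA) != 0);    // false
  }
}

Presumably `emitLastRecord` clears `_flags` (and the related `_contextURLBytes` / `_crawlStatus` state) after each flush so that accumulation starts fresh for the next URL, though that reset is not shown in this excerpt.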