Example #1
  public synchronized String processNative(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());
    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

      return gatedocument.toXml();

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    return null;
  }
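A note on the example above: processNative() returns null when GATE processing throws, and it accepts any Reporter, including the no-op Reporter.NULL constant defined on org.apache.hadoop.mapred.Reporter. A minimal sketch of driving it outside a live task follows; the processor and document setup names are assumptions for illustration, not taken from the example.

  // Hedged sketch: exercising processNative() without a running MapReduce task.
  BehemothDocument doc = new BehemothDocument();          // assumed no-arg constructor
  doc.setUrl("http://example.com/doc1");                  // assumed setter
  doc.setText("Some plain text for GATE to annotate.");   // assumed setter

  GateProcessor processor = new GateProcessor(config);    // hypothetical wrapper owning corpus and GATEapplication
  String gateXml = processor.processNative(doc, Reporter.NULL);
  if (gateXml == null) {
    // the exception was logged and counted under the "GATE"/"Exceptions" counter
  }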
Example #2
    /**
     * Map method.
     *
     * @param offset samples starting from the (offset+1)th sample.
     * @param size the number of samples for this map
     * @param out output {true->numInside, false->numOutside}
     * @param reporter
     */
    public void map(
        LongWritable offset,
        LongWritable size,
        OutputCollector<BooleanWritable, LongWritable> out,
        Reporter reporter)
        throws IOException {

      final HaltonSequence haltonsequence = new HaltonSequence(offset.get());
      long numInside = 0L;
      long numOutside = 0L;

      for (long i = 0; i < size.get(); ) {
        // generate points in a unit square
        final double[] point = haltonsequence.nextPoint();

        // count points inside/outside of the inscribed circle of the square
        final double x = point[0] - 0.5;
        final double y = point[1] - 0.5;
        if (x * x + y * y > 0.25) {
          numOutside++;
        } else {
          numInside++;
        }

        // report status
        i++;
        if (i % 1000 == 0) {
          reporter.setStatus("Generated " + i + " samples.");
        }
      }

      // output map results
      out.collect(new BooleanWritable(true), new LongWritable(numInside));
      out.collect(new BooleanWritable(false), new LongWritable(numOutside));
    }
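The counts emitted above are summed on the reduce side. Points are drawn in the unit square and the inscribed circle of radius 0.5 covers pi/4 of its area, so the estimate is 4 * numInside / (numInside + numOutside). A minimal sketch of that arithmetic (not the job's actual reducer):

    // Hedged sketch: turning the summed map outputs into a pi estimate.
    static double estimatePi(long numInside, long numOutside) {
      // the inscribed circle's area is pi/4 of the unit square's area
      return 4.0 * numInside / (numInside + numOutside);
    }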
Example #3
    /** Run a FileOperation */
    public void map(
        Text key,
        PolicyInfo policy,
        OutputCollector<WritableComparable, Text> out,
        Reporter reporter)
        throws IOException {
      this.reporter = reporter;
      try {
        LOG.info("Raiding file=" + key.toString() + " policy=" + policy);
        Path p = new Path(key.toString());
        FileStatus fs = p.getFileSystem(jobconf).getFileStatus(p);
        st.clear();
        RaidNode.doRaid(jobconf, policy, fs, st, reporter);

        ++succeedcount;

        reporter.incrCounter(Counter.PROCESSED_BLOCKS, st.numProcessedBlocks);
        reporter.incrCounter(Counter.PROCESSED_SIZE, st.processedSize);
        reporter.incrCounter(Counter.META_BLOCKS, st.numMetaBlocks);
        reporter.incrCounter(Counter.META_SIZE, st.metaSize);

        reporter.incrCounter(Counter.FILES_SUCCEEDED, 1);
      } catch (IOException e) {
        ++failcount;
        reporter.incrCounter(Counter.FILES_FAILED, 1);

        String s = "FAIL: " + policy + ", " + key + " " + StringUtils.stringifyException(e);
        out.collect(null, new Text(s));
        LOG.info(s);
      } finally {
        reporter.setStatus(getCountString());
      }
    }
Example #4
 /**
  * This is the function that re-groups values for a key into sub-groups based on a secondary key
  * (input tag).
  *
  * @param arg1 iterator over the values grouped under this key
  * @return a map from input tag to a resettable iterator over the records carrying that tag
  */
 private SortedMap<Object, ResetableIterator> regroup(Object key, Iterator arg1, Reporter reporter)
     throws IOException {
   this.numOfValues = 0;
   SortedMap<Object, ResetableIterator> retv = new TreeMap<Object, ResetableIterator>();
   IntermediateData aRecord = null;
   while (arg1.hasNext()) {
     this.numOfValues += 1;
     // log progress while processing
     if (this.numOfValues % 100 == 0) {
       reporter.setStatus("key: " + key.toString() + " numOfValues: " + this.numOfValues);
     }
     // skip out when exceeding the limit
     if (this.numOfValues > this.maxNumOfValuesPerGroup) {
       break;
     }
     aRecord = ((IntermediateData) arg1.next()).clone(job);
     Text tag = aRecord.getTag();
     ResetableIterator data = retv.get(tag);
     if (data == null) {
       data = createResetableIterator();
       retv.put(tag, data);
     }
     data.add(aRecord);
   }
   // LOG.info("EXIT while");
   if (this.numOfValues > this.largestNumOfValues) {
     this.largestNumOfValues = numOfValues;
     LOG.info("key: " + key.toString() + " this.largestNumOfValues: " + this.largestNumOfValues);
   }
   return retv;
 }
    /**
     * {@inheritDoc}
     *
     * @see org.apache.hadoop.mapred.Mapper#map(java.lang.Object, java.lang.Object,
     *     org.apache.hadoop.mapred.OutputCollector, org.apache.hadoop.mapred.Reporter)
     */
    @Override
    public void map(
        LongWritable key, Text value, OutputCollector<Text, IntWritable> output, Reporter reporter)
        throws IOException {
      String line = m_caseSensitive ? value.toString() : value.toString().toLowerCase();

      for (String pattern : m_patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }

      StringTokenizer tokenizer = new StringTokenizer(line);
      while (tokenizer.hasMoreTokens()) {
        m_word.set(tokenizer.nextToken());
        output.collect(m_word, ONE);
        reporter.incrCounter(Counters.INPUT_WORDS, 1);
      }

      if ((++m_numRecords % 100) == 0) {
        reporter.setStatus(
            "Finished processing "
                + m_numRecords
                + " records "
                + "from the input file: "
                + m_inputFile);
      }
    }
    public void map(
        Text key,
        LongWritable value,
        OutputCollector<Text, LongWritable> collector,
        Reporter reporter)
        throws IOException {

      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      random.setSeed(seed);
      reporter.setStatus("opening " + name);

      DataInputStream in = new DataInputStream(fs.open(new Path(DATA_DIR, name)));

      long read = 0;
      try {
        while (read < size) {
          long remains = size - read;
          int n = (remains <= buffer.length) ? (int) remains : buffer.length;
          in.readFully(buffer, 0, n);
          read += n;
          if (fastCheck) {
            Arrays.fill(check, (byte) random.nextInt(Byte.MAX_VALUE));
          } else {
            random.nextBytes(check);
          }
          if (n != buffer.length) {
            Arrays.fill(buffer, n, buffer.length, (byte) 0);
            Arrays.fill(check, n, check.length, (byte) 0);
          }
          assertTrue(Arrays.equals(buffer, check));

          reporter.setStatus("reading " + name + "@" + read + "/" + size);
        }
      } finally {
        in.close();
      }

      collector.collect(new Text("bytes"), new LongWritable(read));

      reporter.setStatus("read " + name);
    }
    public void map(
        Text key, LongWritable value, OutputCollector<K, LongWritable> collector, Reporter reporter)
        throws IOException {
      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      if (size == 0) return;

      reporter.setStatus("opening " + name);

      FSDataInputStream in = fs.open(new Path(DATA_DIR, name));

      try {
        for (int i = 0; i < SEEKS_PER_FILE; i++) {
          // generate a random position
          long position = Math.abs(random.nextLong()) % size;

          // seek file to that position
          reporter.setStatus("seeking " + name);
          in.seek(position);
          byte b = in.readByte();

          // check that byte matches
          byte checkByte = 0;
          // advance random state to that position
          random.setSeed(seed);
          for (int p = 0; p <= position; p += check.length) {
            reporter.setStatus("generating data for " + name);
            if (fastCheck) {
              checkByte = (byte) random.nextInt(Byte.MAX_VALUE);
            } else {
              random.nextBytes(check);
              checkByte = check[(int) (position % check.length)];
            }
          }
          assertEquals(b, checkByte);
        }
      } finally {
        in.close();
      }
    }
    public void map(
        Text key,
        LongWritable value,
        OutputCollector<Text, LongWritable> collector,
        Reporter reporter)
        throws IOException {

      String name = key.toString();
      long size = value.get();
      long seed = Long.parseLong(name);

      random.setSeed(seed);
      reporter.setStatus("creating " + name);

      // write to temp file initially to permit parallel execution
      Path tempFile = new Path(DATA_DIR, name + suffix);
      OutputStream out = fs.create(tempFile);

      long written = 0;
      try {
        while (written < size) {
          if (fastCheck) {
            Arrays.fill(buffer, (byte) random.nextInt(Byte.MAX_VALUE));
          } else {
            random.nextBytes(buffer);
          }
          long remains = size - written;
          int length = (remains <= buffer.length) ? (int) remains : buffer.length;
          out.write(buffer, 0, length);
          written += length;
          reporter.setStatus("writing " + name + "@" + written + "/" + size);
        }
      } finally {
        out.close();
      }
      // rename to final location
      fs.rename(tempFile, new Path(DATA_DIR, name));

      collector.collect(new Text("bytes"), new LongWritable(written));

      reporter.setStatus("wrote " + name);
    }
Example #9
 /**
  * The subclass can override this method to perform additional filtering and/or other processing
  * logic before a value is collected.
  *
  * @param key
  * @param aRecord
  * @param output
  * @param reporter
  * @throws IOException
  */
 protected void collect(
     Object key, IntermediateData aRecord, OutputCollector output, Reporter reporter)
     throws IOException {
   this.collected += 1;
   addLongValue("collectedCount", 1);
   if (aRecord != null) {
     output.collect(key, aRecord.getData());
     reporter.setStatus("key: " + key.toString() + " collected: " + collected);
     addLongValue("actuallyCollectedCount", 1);
   }
 }
Example #10
 @Override
 public void reduce(
     Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
     throws IOException {
   HashSet<Text> hash = new HashSet<Text>();
   while (iter.hasNext()) {
     hash.add(iter.next());
   }
   for (Text t : hash) oc.collect(key, t);
   reporter.setStatus("OK");
 }
Example #11
 @Override
 public void reduce(
     Text key, Iterator<Text> iter, OutputCollector<Text, Text> oc, Reporter reporter)
     throws IOException {
   HashSet<Text> hash = new HashSet<Text>();
   while (iter.hasNext()) {
     hash.add(iter.next());
   }
   oc.collect(key, new Text(Integer.toString(hash.size())));
   reporter.setStatus("OK");
 }
    public RecordReader<Text, Text> getRecordReader(
        InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {

      reporter.setStatus(genericSplit.toString());
      FileSplit split = (FileSplit) genericSplit;
      final Path file = split.getPath();
      FileSystem fs = file.getFileSystem(job);
      FSDataInputStream fileIn = fs.open(split.getPath());
      if (compressionCodecs != null && compressionCodecs.getCodec(file) != null)
        throw new RuntimeException("Not handling compression!");

      return new StreamXmlRecordReader(fileIn, split, reporter, job, FileSystem.get(job));
    }
 @Override // IOMapperBase
 public Long doIO(Reporter reporter, String name, long totalSize // in bytes
     ) throws IOException {
   InputStream in = (InputStream) this.stream;
   long actualSize = 0;
   while (actualSize < totalSize) {
     int curSize = in.read(buffer, 0, bufferSize);
     if (curSize < 0) break;
     actualSize += curSize;
     reporter.setStatus(
         "reading " + name + "@" + actualSize + "/" + totalSize + " ::host = " + hostName);
   }
   return Long.valueOf(actualSize);
 }
 @Override // IOMapperBase
 public Long doIO(Reporter reporter, String name, long totalSize // in bytes
     ) throws IOException {
   PositionedReadable in = (PositionedReadable) this.stream;
   long actualSize = 0;
   for (long pos = nextOffset(-1); actualSize < totalSize; pos = nextOffset(pos)) {
     int curSize = in.read(pos, buffer, 0, bufferSize);
     if (curSize < 0) break;
     actualSize += curSize;
     reporter.setStatus(
         "reading " + name + "@" + actualSize + "/" + totalSize + " ::host = " + hostName);
   }
   return Long.valueOf(actualSize);
 }
 @SuppressWarnings("unchecked")
 @Override
 /**
  * Instantiates a FileCollectionRecordReader using the specified split (which is assumed to be a
  * CombineFileSplit).
  *
  * @param genericSplit contains files to be processed, assumed to be a CombineFileSplit
  * @param job JobConf of this job
  * @param reporter To report progress
  */
 public RecordReader<Text, SplitAwareWrapper<Document>> getRecordReader(
     InputSplit genericSplit, JobConf job, Reporter reporter) throws IOException {
   reporter.setStatus(genericSplit.toString());
   return new FileCollectionRecordReader(job, (PositionAwareSplit<CombineFileSplit>) genericSplit);
 }
Example #16
 /** Given an output filename, write a bunch of random records to it. */
 public void map(
     WritableComparable key,
     Writable value,
     OutputCollector<BytesWritable, BytesWritable> output,
     Reporter reporter)
     throws IOException {
   int itemCount = 0;
   while (numBytesToWrite > 0) {
     int keyLength = minKeySize + (keySizeRange != 0 ? random.nextInt(keySizeRange) : 0);
     randomKey.setSize(keyLength);
     randomizeBytes(randomKey.getBytes(), 0, randomKey.getLength());
     int valueLength = minValueSize + (valueSizeRange != 0 ? random.nextInt(valueSizeRange) : 0);
     randomValue.setSize(valueLength);
     randomizeBytes(randomValue.getBytes(), 0, randomValue.getLength());
     output.collect(randomKey, randomValue);
     numBytesToWrite -= keyLength + valueLength;
     reporter.incrCounter(Counters.BYTES_WRITTEN, keyLength + valueLength);
     reporter.incrCounter(Counters.RECORDS_WRITTEN, 1);
     if (++itemCount % 200 == 0) {
       reporter.setStatus("wrote record " + itemCount + ". " + numBytesToWrite + " bytes left.");
     }
   }
   reporter.setStatus("done with " + itemCount + " records.");
 }
Example #17
    @Override
    public RecordReader getRecordReader(InputSplit split, JobConf job, Reporter reporter) {

      //      try{ compatible with hadoop-0.14 TODO MC
      reporter.setStatus(split.toString());
      /*
           } catch (IOException e) {
             throw new RuntimeException("Cannot set status for reported:", e);
           }
      */
      // find part name
      SegmentPart segmentPart;
      final String spString;
      try {
        segmentPart = SegmentPart.get((FileSplit) split);
        spString = segmentPart.toString();
      } catch (IOException e) {
        throw new RuntimeException("Cannot identify segment:", e);
      }

      try {
        return new SequenceFileRecordReader(job, (FileSplit) split) {

          @Override
          public synchronized boolean next(Writable key, Writable value) throws IOException {
            LOG.debug("Running OIF.next()");

            MetaWrapper wrapper = (MetaWrapper) value;
            try {
              wrapper.set(getValueClass().newInstance());
            } catch (Exception e) {
              throw new IOException(e.toString());
            }

            boolean res = super.next(key, (Writable) wrapper.get());
            wrapper.setMeta(SEGMENT_PART_KEY, spString);
            return res;
          }

          @Override
          public Writable createValue() {
            return new MetaWrapper();
          }
        };
      } catch (IOException e) {
        throw new RuntimeException("Cannot create RecordReader: ", e);
      }
    }
 @Override // IOMapperBase
 public Long doIO(Reporter reporter, String name, long totalSize // in bytes
     ) throws IOException {
   OutputStream out = (OutputStream) this.stream;
   // write to the file
   long nrRemaining;
   for (nrRemaining = totalSize; nrRemaining > 0; nrRemaining -= bufferSize) {
     int curSize = (bufferSize < nrRemaining) ? bufferSize : (int) nrRemaining;
     out.write(buffer, 0, curSize);
     reporter.setStatus(
         "writing "
             + name
             + "@"
             + (totalSize - nrRemaining)
             + "/"
             + totalSize
             + " ::host = "
             + hostName);
   }
   return Long.valueOf(totalSize);
 }
  private boolean validate(String str, Reporter reporter) {
    String[] parts = str.split("\t");

    if (parts.length != 6) {
      if (parts.length < 6) {
        reporter.incrCounter(LineCounters.TOO_FEW_TABS, 1);
      } else {
        reporter.incrCounter(LineCounters.TOO_MANY_TABS, 1);
      }

      reporter.incrCounter(LineCounters.BAD_LINES, 1);

      if ((reporter.getCounter(LineCounters.BAD_LINES).getCounter() % 10) == 0) {
        reporter.setStatus("Got 10 bad lines.");
        System.err.println("Read another 10 bad lines.");
      }

      return false;
    }

    return true;
  }
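The LineCounters enum referenced by validate() is not shown; a minimal definition consistent with those calls (assumed, not the original source) would be:

  // Assumed counter enum matching the names used in validate();
  // Reporter.incrCounter(Enum, long) groups these under the enum's class name.
  public enum LineCounters {
    TOO_FEW_TABS,
    TOO_MANY_TABS,
    BAD_LINES
  }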
Example #20
    public Long doIO(Reporter reporter, String name, long totalSize) throws IOException {
      totalSize *= MEGA;

      // create instance of local filesystem
      FileSystem localFS = FileSystem.getLocal(fsConfig);

      try {
        // native runtime
        Runtime runTime = Runtime.getRuntime();

        // copy the dso and executable from dfs
        synchronized (this) {
          localFS.delete(HDFS_TEST_DIR, true);
          if (!(localFS.mkdirs(HDFS_TEST_DIR))) {
            throw new IOException("Failed to create " + HDFS_TEST_DIR + " on local filesystem");
          }
        }

        synchronized (this) {
          if (!localFS.exists(HDFS_SHLIB)) {
            if (!FileUtil.copy(fs, HDFS_SHLIB, localFS, HDFS_SHLIB, false, fsConfig)) {
              throw new IOException("Failed to copy " + HDFS_SHLIB + " to local filesystem");
            }

            String chmodCmd = new String(CHMOD + " a+x " + HDFS_SHLIB);
            Process process = runTime.exec(chmodCmd);
            int exitStatus = process.waitFor();
            if (exitStatus != 0) {
              throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus);
            }
          }
        }

        synchronized (this) {
          if (!localFS.exists(HDFS_READ)) {
            if (!FileUtil.copy(fs, HDFS_READ, localFS, HDFS_READ, false, fsConfig)) {
              throw new IOException("Failed to copy " + HDFS_READ + " to local filesystem");
            }

            String chmodCmd = new String(CHMOD + " a+x " + HDFS_READ);
            Process process = runTime.exec(chmodCmd);
            int exitStatus = process.waitFor();

            if (exitStatus != 0) {
              throw new IOException(chmodCmd + ": Failed with exitStatus: " + exitStatus);
            }
          }
        }

        // exec the C program
        Path inFile = new Path(DATA_DIR, name);
        String readCmd = new String(HDFS_READ + " " + inFile + " " + totalSize + " " + bufferSize);
        Process process = runTime.exec(readCmd, null, new File(HDFS_TEST_DIR.toString()));
        int exitStatus = process.waitFor();

        if (exitStatus != 0) {
          throw new IOException(HDFS_READ + ": Failed with exitStatus: " + exitStatus);
        }
      } catch (InterruptedException interruptedException) {
        reporter.setStatus(interruptedException.toString());
      } finally {
        localFS.close();
      }
      return new Long(totalSize);
    }
Example #21
  protected void sequenceCrush(FileSystem fs, FileStatus[] status)
      throws IOException, CrushException {
    l4j.info("Sequence file crushing activated");
    Class keyClass = null;
    Class valueClass = null;
    SequenceFile.Writer writer = null;
    for (FileStatus stat : status) {
      if (reporter != null) {
        reporter.setStatus("Crushing on " + stat.getPath());
        l4j.info("Current file " + stat.getPath());
        l4j.info("length " + stat.getLen());
        reporter.incrCounter(CrushMapper.CrushCounters.FILES_CRUSHED, 1);
      }
      Path p1 = stat.getPath();
      SequenceFile.Reader read = new SequenceFile.Reader(fs, p1, jobConf);
      if (keyClass == null) {
        keyClass = read.getKeyClass();
        valueClass = read.getValueClass();
        writer =
            SequenceFile.createWriter(
                fs, jobConf, outPath, keyClass, valueClass, this.compressionType, this.codec);
      } else {
        if (!(keyClass.equals(read.getKeyClass()) && valueClass.equals(read.getValueClass()))) {
          read.close();
          writer.close();
          throw new CrushException(
              "File  "
                  + stat.getPath()
                  + " keyClass "
                  + read.getKeyClass()
                  + " valueClass "
                  + read.getValueClassName()
                  + " does not match"
                  + " other files in folder");
        }
      }

      Writable k = (Writable) ReflectionUtils.newInstance(keyClass, jobConf);
      Writable v = (Writable) ReflectionUtils.newInstance(valueClass, jobConf);

      int rowCount = 0;
      while (read.next(k, v)) {

        writer.append(k, v);
        rowCount++;
        if (rowCount % 100000 == 0) {
          if (reporter != null) {
            reporter.setStatus(stat + " at row " + rowCount);
            l4j.debug(stat + " at row " + rowCount);
          }
        }
      }
      read.close();
      if (reporter != null) {
        reporter.incrCounter(CrushMapper.CrushCounters.ROWS_WRITTEN, rowCount);
      }
    } // end for
    writer.close();

    l4j.info("crushed file written to " + outPath);
  }
Example #22
  public TableJoinRecordReader(
      JobConf jobConf, CloudataConf conf, TableSplit tableSplit, Reporter reporter)
      throws IOException {
    this.conf = conf;

    String mergeEvaluatorClass = tableSplit.getInputTableInfo().getMergeEvaluatorClass();
    MergeEvaluator mergeEvaluator = null;
    if (mergeEvaluatorClass != null && mergeEvaluatorClass.length() > 0) {
      try {
        mergeEvaluator = (MergeEvaluator) Class.forName(mergeEvaluatorClass).newInstance();
      } catch (Exception e) {
        LOG.error("mergeEvaluator:" + mergeEvaluatorClass + "," + e.getMessage());
        IOException err = new IOException(e.getMessage() + ":" + mergeEvaluatorClass);
        err.initCause(e);
        throw err;
      }
    }

    RowFilter splitRowFilter = tableSplit.getRowFilter();
    InputTableInfo inputTableInfo = tableSplit.getInputTableInfo();

    this.startRowKey = splitRowFilter.getStartRowKey();
    this.endRowKey = splitRowFilter.getEndRowKey();

    RowFilter rowFilter = inputTableInfo.getRowFilter();
    rowFilter.setStartRowKey(startRowKey);
    rowFilter.setEndRowKey(endRowKey);

    CTable ctable = CTable.openTable(conf, inputTableInfo.getTableName());

    TableScanner pivotScanner = null;
    TableScanner targetScanner = null;
    try {
      pivotScanner =
          ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
      Row.Key firstRowKey = null;
      try {
        // If this is not the first Tablet, skip its first row.
        if (!startRowKey.equals(Row.Key.MIN_KEY)) {
          Row pivotRow = pivotScanner.nextRow();
          if (pivotRow == null) {
            end = true;
            return;
          }

          if (firstRowKey == null) {
            firstRowKey = pivotRow.getKey();
          }

          if (firstRowKey.equals(pivotRow.getKey())) {
            pivotRow = pivotScanner.nextRow();
            if (pivotRow == null) {
              end = true;
              return;
            }
          }
          pivotScanner.close();
          rowFilter.setStartRowKey(firstRowKey);
          pivotScanner =
              ScannerFactory.openScanner(ctable, rowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);
        } else {
          firstRowKey = startRowKey;
        }
      } catch (Exception e) {
        if (pivotScanner != null) {
          pivotScanner.close();
        }
        throw e;
      }

      RowFilter joinRowFilter = inputTableInfo.getJoinRowFilter();

      if (mergeEvaluator != null) {
        if (!firstRowKey.equals(Row.Key.MIN_KEY)) {
          joinRowFilter.setStartRowKey(mergeEvaluator.parseTargetRowKey(firstRowKey, 0));
        } else {
          joinRowFilter.setStartRowKey(Row.Key.MIN_KEY);
        }
        if (!rowFilter.getEndRowKey().equals(Row.Key.MAX_KEY)) {
          joinRowFilter.setEndRowKey(mergeEvaluator.parseTargetRowKey(rowFilter.getEndRowKey(), 0));
        } else {
          joinRowFilter.setEndRowKey(Row.Key.MAX_KEY);
        }
      } else {
        joinRowFilter.setStartRowKey(firstRowKey);
        joinRowFilter.setEndRowKey(rowFilter.getEndRowKey());
      }

      reporter.setStatus(
          inputTableInfo.getTableName()
              + ":"
              + startRowKey
              + " ~ "
              + endRowKey
              + ", "
              + inputTableInfo.getJoinTableName()
              + ":"
              + joinRowFilter.getStartRowKey()
              + " ~ "
              + joinRowFilter.getEndRowKey());

      // create a scanner over the target table covering the pivot table's startRow ~ endRow range
      CTable targetTable = CTable.openTable(conf, inputTableInfo.getJoinTableName());
      targetScanner =
          ScannerFactory.openScanner(targetTable, joinRowFilter, TableScanner.SCANNER_OPEN_TIMEOUT);

      this.scanner =
          new MergeScanner(new TableScanner[] {pivotScanner, targetScanner}, mergeEvaluator);
    } catch (Exception e) {
      if (targetScanner != null) {
        targetScanner.close();
      }
      IOException err = new IOException(e.getMessage());
      err.initCause(e);
      throw err;
    }
  }
 @Override
 public RecordReader<LongWritable, Text> getRecordReader(
     InputSplit split, JobConf job, Reporter reporter) throws IOException {
   reporter.setStatus(split.toString());
   return new ExampleRecordReader(job, (FileSplit) split);
 }
Example #24
  // Process an input document with GATE and a Reporter
  public synchronized BehemothDocument[] process(BehemothDocument inputDoc, Reporter reporter) {
    if (reporter != null) reporter.setStatus("GATE : " + inputDoc.getUrl().toString());

    boolean clearBehemothAnnotations = config.getBoolean("gate.deleteBehemothAnnotations", false);

    // process the text passed as value with the application
    // a) create a GATE document based on the text value
    gate.Document gatedocument = null;
    try {

      gatedocument = generateGATEDoc(inputDoc);
      // add it to the current corpus
      corpus.add(gatedocument);
      // get the application and assign the corpus to it
      this.GATEapplication.setCorpus(corpus);
      // process it with GATE
      this.GATEapplication.execute();

      AnnotationSet annots = null;
      if ("".equals(filters.getAnnotationSetName())) annots = gatedocument.getAnnotations();
      else annots = gatedocument.getAnnotations(filters.getAnnotationSetName());

      // enrich the input doc with the annotations from
      // the GATE application
      // transfer the annotations from the GATE document
      // to the Behemoth one using the filters
      List<com.digitalpebble.behemoth.Annotation> beheannotations =
          convertGATEAnnotationsToBehemoth(annots, inputDoc);

      // sort the annotations before adding them?
      Collections.sort(beheannotations);

      // clear the existing behemoth annotations
      if (clearBehemothAnnotations) {
        inputDoc.getAnnotations().clear();
      }

      inputDoc.getAnnotations().addAll(beheannotations);

      // add counters about num of annotations added
      if (reporter != null)
        for (com.digitalpebble.behemoth.Annotation annot : beheannotations) {
          reporter.incrCounter("GATE", annot.getType(), 1);
        }

      // Add the document features from GATE to Behemoth
      Set<String> docFeatFilter = this.filters.getDocFeaturesFilter();
      MapWritable beheMD = inputDoc.getMetadata(true);
      if (docFeatFilter.size() > 0) {
        for (String docFeatName : docFeatFilter) {
          Object featValue = gatedocument.getFeatures().get(docFeatName);
          if (featValue != null) {
            beheMD.put(new Text(docFeatName), new Text(featValue.toString()));
          }
        }
      }

      if (reporter != null) reporter.incrCounter("GATE", "Document", 1);

    } catch (Exception e) {
      LOG.error(inputDoc.getUrl().toString(), e);
      if (reporter != null) reporter.incrCounter("GATE", "Exceptions", 1);
    } finally {
      // remove the document from the corpus again
      corpus.clear();
      // and from memory
      if (gatedocument != null) Factory.deleteResource(gatedocument);
    }
    // currently returns only the input document
    return new BehemothDocument[] {inputDoc};
  }
Example #25
 public void status(String message) {
   reporter.setStatus(message.toString());
 }
 @Override
 public RecordReader<LongWritable, Text> getRecordReader(
     InputSplit split, JobConf conf, Reporter reporter) throws IOException {
   reporter.setStatus(split.toString());
   return new DeprecatedLzoLineRecordReader(conf, (FileSplit) split);
 }
 // constructor used by the old API
 ESRecordReader(
     org.apache.hadoop.mapred.InputSplit split, Configuration job, Reporter reporter) {
   reporter.setStatus(split.toString());
   init((ESInputSplit) split, job);
 }
Example #28
 private void updateStatus(Reporter reporter) {
   reporter.setStatus(getCountString());
 }
Example #29
    @Override
    public void map(LongWritable key, Text value, OutputCollector<Text, Text> oc, Reporter reporter)
        throws IOException {
      BufferedReader fin = null;
      InputStream is = null;
      try {
        String s3Path = value.toString();
        URL url = new URL(s3Path);
        URLConnection conn = url.openConnection();
        conn.setConnectTimeout(20000);
        conn.setReadTimeout(20000);

        is = conn.getInputStream();
        // skip the leading two bytes ("BZ" magic) before handing the stream to CBZip2InputStream
        is.read();
        is.read();
        fin = new BufferedReader(new InputStreamReader(new CBZip2InputStream(is), "UTF-8"));
        String currentTitle = "";
        // int cnt = 0;
        String line = null;
        StringWriter merged = null;
        CsvWriter writer;

        while ((line = fin.readLine()) != null) {
          if ("<page>".equals(line.trim())) {
            String secondLine = fin.readLine();
            currentTitle =
                new String(
                    secondLine.substring(
                        secondLine.indexOf(pre) + pre.length(), secondLine.indexOf(suf)));
            secondLine = null;
          }
          if (line.trim().startsWith("{{Infobox")) {
            sb = new StringBuilder();
            merged = new StringWriter();
            writer = new CsvWriter(merged, ',');
            sb.append(line);
            sb.append(sep);
            while (true) {
              line = fin.readLine().trim();
              sb.append(line);
              sb.append(sep);
              if ("}}".equals(line)) {
                sb.append(line);
                sb.append(sep);
                break;
              }
              reporter.progress();
            }
            writer.writeRecord(new String[] {currentTitle, sb.toString()});
            writer.flush();
            oc.collect(new Text(""), new Text(merged.toString()));
            reporter.progress();
            reporter.setStatus(value.toString() + " processed");
            sb = null;
            merged = null;
            writer = null;
          }

          line = null;
        }
      } catch (IOException ioe) {
        reporter.setStatus("This task didn't get fully passed");
      } finally {
        try {
          fin.close();
        } catch (Exception e) {
          e.printStackTrace();
        } finally {
        }
      }
    }
Example #30
    /**
     * Copy a file to a destination.
     *
     * @param srcstat src path and metadata
     * @param dstpath dst path
     * @param reporter
     */
    private void copy(
        FileStatus srcstat,
        Path relativedst,
        OutputCollector<WritableComparable<?>, Text> outc,
        Reporter reporter)
        throws IOException {
      Path absdst = new Path(destPath, relativedst);
      int totfiles = job.getInt(SRC_COUNT_LABEL, -1);
      assert totfiles >= 0 : "Invalid file count " + totfiles;

      // if a directory, ensure created even if empty
      if (srcstat.isDir()) {
        if (destFileSys.exists(absdst)) {
          if (!destFileSys.getFileStatus(absdst).isDir()) {
            throw new IOException("Failed to mkdirs: " + absdst + " is a file.");
          }
        } else if (!destFileSys.mkdirs(absdst)) {
          throw new IOException("Failed to mkdirs " + absdst);
        }
        // TODO: when modification times can be set, directories should be
        // emitted to reducers so they might be preserved. Also, mkdirs does
        // not currently return an error when the directory already exists;
        // if this changes, all directory work might as well be done in reduce
        return;
      }

      if (destFileSys.exists(absdst) && !overwrite && !needsUpdate(srcstat, destFileSys, absdst)) {
        outc.collect(null, new Text("SKIP: " + srcstat.getPath()));
        ++skipcount;
        reporter.incrCounter(Counter.SKIP, 1);
        updateStatus(reporter);
        return;
      }

      Path tmpfile = new Path(job.get(TMP_DIR_LABEL), relativedst);
      long cbcopied = 0L;
      FSDataInputStream in = null;
      FSDataOutputStream out = null;
      try {
        // open src file
        try {
          in = srcstat.getPath().getFileSystem(job).open(srcstat.getPath());
        } catch (IOException e) {
          LOG.error("Failed to open src file " + srcstat.getPath() + ", ignore and return");
          in = null;
          return;
        }
        reporter.incrCounter(Counter.BYTESEXPECTED, srcstat.getLen());
        // open tmp file
        out = create(tmpfile, reporter, srcstat);
        // copy file
        for (int cbread; (cbread = in.read(buffer)) >= 0; ) {
          out.write(buffer, 0, cbread);
          cbcopied += cbread;
          reporter.setStatus(
              String.format("%.2f ", cbcopied * 100.0 / srcstat.getLen())
                  + absdst
                  + " [ "
                  + StringUtils.humanReadableInt(cbcopied)
                  + " / "
                  + StringUtils.humanReadableInt(srcstat.getLen())
                  + " ]");
        }
      } finally {
        checkAndClose(in);
        checkAndClose(out);
      }

      if (cbcopied != srcstat.getLen()) {
        if (srcstat.getLen() == 0 && cbcopied > 0) {
          LOG.info("most likely see a WAL file corruption: " + srcstat.getPath());
        } else {
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(cbcopied)
                  + " to tmpfile (="
                  + tmpfile
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
      } else {
        if (totfiles == 1) {
          // Copying a single file; use dst path provided by user as destination
          // rather than destination directory, if a file
          Path dstparent = absdst.getParent();
          if (!(destFileSys.exists(dstparent) && destFileSys.getFileStatus(dstparent).isDir())) {
            absdst = dstparent;
          }
        }
        if (destFileSys.exists(absdst) && destFileSys.getFileStatus(absdst).isDir()) {
          throw new IOException(absdst + " is a directory");
        }
        if (!destFileSys.mkdirs(absdst.getParent())) {
          throw new IOException("Failed to craete parent dir: " + absdst.getParent());
        }
        rename(tmpfile, absdst);

        FileStatus dststat = destFileSys.getFileStatus(absdst);
        if (dststat.getLen() != srcstat.getLen()) {
          destFileSys.delete(absdst, false);
          throw new IOException(
              "File size not matched: copied "
                  + bytesString(dststat.getLen())
                  + " to dst (="
                  + absdst
                  + ") but expected "
                  + bytesString(srcstat.getLen())
                  + " from "
                  + srcstat.getPath());
        }
        updatePermissions(srcstat, dststat);
      }

      // report at least once for each file
      ++copycount;
      reporter.incrCounter(Counter.BYTESCOPIED, cbcopied);
      reporter.incrCounter(Counter.COPY, 1);
      updateStatus(reporter);
    }
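The copy() method above relies on helpers that are not shown, such as checkAndClose() and bytesString(). A sketch consistent with how they are called (assumed, not the example's own code):

      // Assumed helpers, written only to match the call sites in copy() above.
      private static boolean checkAndClose(java.io.Closeable io) {
        if (io != null) {
          try {
            io.close();
          } catch (IOException e) {
            return false; // swallow so the sibling stream still gets closed
          }
        }
        return true;
      }

      private static String bytesString(long b) {
        return b + " bytes (" + StringUtils.humanReadableInt(b) + ")";
      }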