Example 1
  @SuppressWarnings("unchecked")
  public static <V extends Writable> List<V> readValues(Path path, FileSystem fs, int max) {
    List<V> list = new ArrayList<V>();

    try {
      int k = 0;
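      // Note: the Reader(FileSystem, Path, Configuration) constructor used below is
      // deprecated in Hadoop 2.x; the option-based form Reader(conf, Reader.file(path))
      // shown in Examples 4 and 12 is the current replacement.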
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      Writable key = (Writable) reader.getKeyClass().newInstance();
      V value = (V) reader.getValueClass().newInstance();
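      // reader.next() deserializes each record into the objects passed to it, so a fresh
      // value instance is created inside the loop before reading the next record; otherwise
      // every entry already added to the list would be overwritten.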

      while (reader.next(key, value)) {
        k++;
        list.add(value);
        if (k >= max) {
          break;
        }

        value = (V) reader.getValueClass().newInstance();
      }
      reader.close();
    } catch (Exception e) {
      throw new RuntimeException("Error reading SequenceFile " + path, e);
    }

    return list;
  }
Example 2
  /**
   * Reads key-value pairs from a SequenceFile, up to a maximum number.
   *
   * @param path path to the file
   * @param fs file system holding the file
   * @param max maximum number of key-value pairs to read
   * @return list of key-value pairs
   */
  @SuppressWarnings("unchecked")
  public static <K extends Writable, V extends Writable> List<PairOfWritables<K, V>> readFile(
      Path path, FileSystem fs, int max) throws IOException {
    List<PairOfWritables<K, V>> list = new ArrayList<PairOfWritables<K, V>>();

    try {
      int k = 0;
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

      K key;
      V value;

      key = (K) reader.getKeyClass().newInstance();
      value = (V) reader.getValueClass().newInstance();

      while (reader.next(key, value)) {
        k++;
        list.add(new PairOfWritables<K, V>(key, value));
        if (k >= max) {
          break;
        }

        // Create new objects, because the key, value gets reused
        key = (K) reader.getKeyClass().newInstance();
        value = (V) reader.getValueClass().newInstance();
      }
      reader.close();
    } catch (IllegalAccessException e) {
      throw new RuntimeException("Error reading SequenceFile: " + e);
    } catch (InstantiationException e) {
      throw new RuntimeException("Error reading SequenceFile: " + e);
    }

    return list;
  }
Example 3
    private static void loadPairs(
        HMapIV<ArrayListOfIntsWritable> pwsimMapping, int langID, JobConf job, Reporter reporter) {
      try {
        Path[] localFiles = DistributedCache.getLocalCacheFiles(job);
        String pwsimFile = job.get("PwsimPairs");
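        // DistributedCache.getLocalCacheFiles() returns the local copies of all files
        // shipped with the job; the pwsim pairs file is picked out below by file name.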
        for (Path localFile : localFiles) {
          if (localFile.toString().contains(getFilename(pwsimFile))) {
            SequenceFile.Reader reader =
                new SequenceFile.Reader(FileSystem.getLocal(job), localFile, job);

            PairOfInts key = (PairOfInts) reader.getKeyClass().newInstance();
            IntWritable value = (IntWritable) reader.getValueClass().newInstance();
            int cnt = 0;
            while (reader.next(key, value)) {
              int fDocno = key.getRightElement();
              int eDocno = key.getLeftElement();
              if ((eDocno == 6127 && fDocno == 1000000074)
                  || (eDocno == 6127 && fDocno == 1000000071)) {
                sLogger.info(key);
              }
              if (langID == CLIRUtils.E) {
                if (!pwsimMapping.containsKey(eDocno)) {
                  pwsimMapping.put(eDocno, new ArrayListOfIntsWritable());
                }
                // we add 1000000000 to foreign docnos to distinguish them during the pwsim algo
                pwsimMapping.get(eDocno).add(fDocno);
              } else {
                if (!pwsimMapping.containsKey(fDocno)) {
                  pwsimMapping.put(fDocno, new ArrayListOfIntsWritable());
                }
                // we add 1000000000 to foreign docnos to distinguish them during the pwsim algo
                pwsimMapping.get(fDocno).add(eDocno);
              }
              cnt++;
              key = (PairOfInts) reader.getKeyClass().newInstance();
              value = (IntWritable) reader.getValueClass().newInstance();
            }
            reader.close();
            sLogger.info(pwsimMapping.size() + "," + cnt + " pairs loaded from " + localFile);
          }
        }
      } catch (Exception e) {
        throw new RuntimeException(e);
      }
    }
Example 4
  public static XYSeries getXY(String url) {
    XYSeries xyseries = new XYSeries("");

    Path path = new Path(url);
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    try {
      reader =
          new SequenceFile.Reader(
              conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0));
      DoubleArrStrWritable dkey =
          (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      DoublePairWritable dvalue =
          (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
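      // ReflectionUtils.newInstance() creates the Writable through its no-arg constructor
      // and, unlike Class.newInstance(), also passes the Configuration to classes that
      // implement Configurable.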

      while (reader.next(dkey, dvalue)) { // read the file record by record
        xyseries.add(dvalue.getFirst(), dvalue.getSecond());
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
    }
    return xyseries;
  }
Example 5
  /**
   * Returns the x*y values read from the SequenceFile at the given url.
   *
   * @param url path to the file
   * @return sorted array of the x*y values
   */
  public Double[] getR(String url) {
    List<Double> list = new ArrayList<Double>();
    Path path = new Path(url);
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    try {
      reader =
          new SequenceFile.Reader(
              conf, Reader.file(path), Reader.bufferSize(4096), Reader.start(0));
      DoubleArrStrWritable dkey =
          (DoubleArrStrWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
      DoublePairWritable dvalue =
          (DoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);

      while (reader.next(dkey, dvalue)) { // read the file record by record
        // list.add(dvalue.getSum() * dvalue.getDistance());
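        // NOTE: with the accumulation line commented out, `list` stays empty and the
        // returned array has length 0.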
      }
    } catch (Exception e) {
      e.printStackTrace();
    } finally {
      IOUtils.closeStream(reader);
    }
    Double[] dList = new Double[list.size()];
    dList = list.toArray(dList);
    Arrays.sort(dList);
    return dList;
  }
Example 6
  private static Map<Integer, List<WeightedVectorWritable>> readPoints(
      Path pointsPathDir, Configuration conf) throws IOException {
    Map<Integer, List<WeightedVectorWritable>> result =
        new TreeMap<Integer, List<WeightedVectorWritable>>();

    FileSystem fs = pointsPathDir.getFileSystem(conf);
    FileStatus[] children =
        fs.listStatus(
            pointsPathDir,
            new PathFilter() {
              @Override
              public boolean accept(Path path) {
                return !(path.getName().endsWith(".crc") || "_logs".equals(path.getName()));
              }
            });

    for (FileStatus file : children) {
      Path path = file.getPath();
      SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
      try {
        IntWritable key = reader.getKeyClass().asSubclass(IntWritable.class).newInstance();
        WeightedVectorWritable value =
            reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance();
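        // asSubclass(...).newInstance() requires a public no-arg constructor (the usual
        // Writable contract); reflection failures end up in the catch blocks below.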
        while (reader.next(key, value)) {
          // key is the cluster id as an int, value is the weighted vector assigned to that
          // cluster; points are grouped by cluster id in the result map
          List<WeightedVectorWritable> pointList = result.get(key.get());
          if (pointList == null) {
            pointList = new ArrayList<WeightedVectorWritable>();
            result.put(key.get(), pointList);
          }
          pointList.add(value);
          value = reader.getValueClass().asSubclass(WeightedVectorWritable.class).newInstance();
        }
      } catch (InstantiationException e) {
        log.error("Exception", e);
      } catch (IllegalAccessException e) {
        log.error("Exception", e);
      }
    }

    return result;
  }
Example 7
 public SeqFileInputStream(FileSystem fs, FileStatus f) throws IOException {
   r = new SequenceFile.Reader(fs, f.getPath(), getConf());
   key =
       ReflectionUtils.newInstance(
           r.getKeyClass().asSubclass(WritableComparable.class), getConf());
   val = ReflectionUtils.newInstance(r.getValueClass().asSubclass(Writable.class), getConf());
   inbuf = new DataInputBuffer();
   outbuf = new DataOutputBuffer();
 }
Example 8
  private static int readSequenceFile(Path path, FileSystem fs, int max) throws IOException {
    SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, fs.getConf());

    System.out.println("Reading " + path + "...\n");
    try {
      System.out.println("Key type: " + reader.getKeyClass().toString());
      System.out.println("Value type: " + reader.getValueClass().toString() + "\n");
    } catch (Exception e) {
      throw new RuntimeException("Error loading key/value class", e);
    }

    Writable key = null, value;
    int n = 0;
    try {
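      // Tuple keys/values (presumably Pig tuples) cannot be instantiated reflectively,
      // so they come from the TupleFactory; plain Writables are created via reflection.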
      if (Tuple.class.isAssignableFrom(reader.getKeyClass())) {
        key = TUPLE_FACTORY.newTuple();
      } else {
        key = (Writable) reader.getKeyClass().newInstance();
      }

      if (Tuple.class.isAssignableFrom(reader.getValueClass())) {
        value = TUPLE_FACTORY.newTuple();
      } else {
        value = (Writable) reader.getValueClass().newInstance();
      }

      while (reader.next(key, value)) {
        System.out.println("Record " + n);
        System.out.println("Key: " + key + "\nValue: " + value);
        System.out.println("----------------------------------------");
        n++;

        if (n >= max) break;
      }
      reader.close();
      System.out.println(n + " records read.\n");
    } catch (Exception e) {
      e.printStackTrace();
    }

    return n;
  }
Example 9
  private static List<IDistanceDensityMul> getIDistanceDensityMulList(String url)
      throws FileNotFoundException, IOException {
    Configuration conf = HUtils.getConf();
    SequenceFile.Reader reader = null;
    // results merged from several files; they still need to be sorted
    List<IDistanceDensityMul> allList = new ArrayList<IDistanceDensityMul>();
    // records read from a single file
    List<IDistanceDensityMul> fileList = new ArrayList<IDistanceDensityMul>();

    FileStatus[] fss =
        HUtils.getHDFSPath(url, "true")
            .getFileSystem(conf)
            .listStatus(HUtils.getHDFSPath(url, "true"));
    for (FileStatus f : fss) {
      if (!f.toString().contains("part")) {
        continue; // skip anything that is not a part-* output file
      }
      try {
        reader =
            new SequenceFile.Reader(
                conf, Reader.file(f.getPath()), Reader.bufferSize(4096), Reader.start(0));
        // key:   <density_i * min_distance_j>  (DoubleWritable)
        // value: <first: density_i, second: min_distance_j, third: i>  (IntDoublePairWritable)
        CustomDoubleWritable dkey =
            (CustomDoubleWritable) ReflectionUtils.newInstance(reader.getKeyClass(), conf);
        IntDoublePairWritable dvalue =
            (IntDoublePairWritable) ReflectionUtils.newInstance(reader.getValueClass(), conf);
        int i = Utils.GETDRAWPICRECORDS_EVERYFILE;
        while (reader.next(dkey, dvalue) && i > 0) { // read the file record by record
          i--;
          fileList.add(
              new IDistanceDensityMul(
                  dvalue.getSecond(),
                  dvalue.getFirst(),
                  dvalue.getThird(),
                  dkey.get())); // each file is already sorted in ascending order
        }
      } catch (Exception e) {
        e.printStackTrace();
      } finally {
        IOUtils.closeStream(reader);
      }

      // merge the first Utils.GETDRAWPICRECORDS_EVERYFILE records of the current file
      if (allList.size() <= 0) { // the first file can be added in full
        allList.addAll(fileList);
      } else {
        combineLists(allList, fileList);
      }
    } // for
    // the first point is too large, so drop it
    return allList.subList(1, allList.size());
  }
Example 10
    @Override
    protected boolean doProcess(Record inputRecord, final InputStream in) throws IOException {
      SequenceFile.Metadata sequenceFileMetaData = null;
      SequenceFile.Reader reader = null;
      try {
        reader =
            new SequenceFile.Reader(
                conf,
                SequenceFile.Reader.stream(new FSDataInputStream(new ForwardOnlySeekable(in))));
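        // Reader.stream(...) reads from a forward-only stream, so the reader cannot know
        // the file length; end of input surfaces as the EOFException handled below.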

        if (includeMetaData) {
          sequenceFileMetaData = reader.getMetadata();
        }
        Class keyClass = reader.getKeyClass();
        Class valueClass = reader.getValueClass();
        Record template = inputRecord.copy();
        removeAttachments(template);

        while (true) {
          Writable key = (Writable) ReflectionUtils.newInstance(keyClass, conf);
          Writable val = (Writable) ReflectionUtils.newInstance(valueClass, conf);
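          // fresh key/value objects are created for every record because they are attached
          // to the emitted output record below and therefore must not be reused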
          try {
            if (!reader.next(key, val)) {
              break;
            }
          } catch (EOFException ex) {
            // SequenceFile.Reader will throw an EOFException after reading
            // all the data, if it doesn't know the length.  Since we are
            // passing in an InputStream, we hit this case;
            LOG.trace("Received expected EOFException", ex);
            break;
          }
          incrementNumRecords();
          Record outputRecord = template.copy();
          outputRecord.put(keyField, key);
          outputRecord.put(valueField, val);
          outputRecord.put(Fields.ATTACHMENT_MIME_TYPE, OUTPUT_MEDIA_TYPE);
          if (includeMetaData && sequenceFileMetaData != null) {
            outputRecord.put(SEQUENCE_FILE_META_DATA, sequenceFileMetaData);
          }

          // pass record to next command in chain:
          if (!getChild().process(outputRecord)) {
            return false;
          }
        }
      } finally {
        Closeables.closeQuietly(reader);
      }
      return true;
    }
Example 11
 private int getMessageCount(LogFilePath logFilePath) throws Exception {
   String path = logFilePath.getLogFilePath();
   Path fsPath = new Path(path);
   FileSystem fileSystem = FileUtil.getFileSystem(path);
   SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
   LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
   BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
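   // the same key/value objects can be reused across next() calls here because the
   // records are only counted, never retained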
   int result = 0;
   while (reader.next(key, value)) {
     result++;
   }
   reader.close();
   return result;
 }
Example 12
 public static void main(String[] args) throws Exception {
   // TODO Auto-generated method stub
   String mapUri = args[0];
   Configuration conf = new Configuration();
   FileSystem fs = FileSystem.get(URI.create(mapUri), conf);
   Path map = new Path(mapUri);
   Path mapData = new Path(mapUri, MapFile.DATA_FILE_NAME);
   SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(mapData));
   Class keyClass = reader.getKeyClass();
   Class valueClass = reader.getValueClass();
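   // the data file is opened only to discover the key/value classes, which MapFile.fix()
   // needs in order to rebuild the MapFile index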
   reader.close();
   long entries = MapFile.fix(fs, map, keyClass, valueClass, false, conf);
   System.out.printf("Created MapFile %s with %d entries\n", map, entries);
 }
Example 13
 private void getOffsets(LogFilePath logFilePath, Set<Long> offsets) throws Exception {
   String path = logFilePath.getLogFilePath();
   Path fsPath = new Path(path);
   FileSystem fileSystem = FileUtil.getFileSystem(path);
   SequenceFile.Reader reader = new SequenceFile.Reader(fileSystem, fsPath, new Configuration());
   LongWritable key = (LongWritable) reader.getKeyClass().newInstance();
   BytesWritable value = (BytesWritable) reader.getValueClass().newInstance();
   while (reader.next(key, value)) {
     if (!offsets.add(key.get())) {
       throw new RuntimeException(
           "duplicate key " + key.get() + " found in file " + logFilePath.getLogFilePath());
     }
   }
   reader.close();
 }
Example 14
 @Override
 public long next(HdfsInputStream hdfsistr, Holder<Object> key, Holder<Object> value) {
   try {
     SequenceFile.Reader reader = (SequenceFile.Reader) hdfsistr.getIn();
     Holder<Integer> keySize = new Holder<Integer>();
     Writable keyWritable =
         (Writable) ReflectionUtils.newInstance(reader.getKeyClass(), new Configuration());
     Holder<Integer> valueSize = new Holder<Integer>();
     Writable valueWritable =
         (Writable) ReflectionUtils.newInstance(reader.getValueClass(), new Configuration());
     if (reader.next(keyWritable, valueWritable)) {
       key.value = getObject(keyWritable, keySize);
       value.value = getObject(valueWritable, valueSize);
       return keySize.value + valueSize.value;
     } else {
       return 0;
     }
   } catch (Exception ex) {
     throw new RuntimeCamelException(ex);
   }
 }
Example 15
  public void printClusters(String[] dictionary)
      throws IOException, InstantiationException, IllegalAccessException {
    Configuration conf = new Configuration();

    if (this.termDictionary != null) {
      if ("text".equals(dictionaryFormat)) {
        dictionary = VectorHelper.loadTermDictionary(new File(this.termDictionary));
      } else if ("sequencefile".equals(dictionaryFormat)) {
        FileSystem fs = FileSystem.get(new Path(this.termDictionary).toUri(), conf);
        dictionary = VectorHelper.loadTermDictionary(conf, fs, this.termDictionary);
      } else {
        throw new IllegalArgumentException("Invalid dictionary format");
      }
    }

    Writer writer =
        this.outputFile == null
            ? new OutputStreamWriter(System.out)
            : new FileWriter(this.outputFile);
    try {
      FileSystem fs = seqFileDir.getFileSystem(conf);
      for (FileStatus seqFile : fs.globStatus(new Path(seqFileDir, "part-*"))) {
        Path path = seqFile.getPath();
        // System.out.println("Input Path: " + path); doesn't this interfere with output?
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, path, conf);
        try {
          Writable key = reader.getKeyClass().asSubclass(Writable.class).newInstance();
          Writable value = reader.getValueClass().asSubclass(Writable.class).newInstance();
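          // the values are expected to be Cluster implementations (they are cast below),
          // e.g. the output of a clustering job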
          while (reader.next(key, value)) {
            Cluster cluster = (Cluster) value;
            String fmtStr = useJSON ? cluster.asJsonString() : cluster.asFormatString(dictionary);
            if (subString > 0 && fmtStr.length() > subString) {
              writer.write(':');
              writer.write(fmtStr, 0, Math.min(subString, fmtStr.length()));
            } else {
              writer.write(fmtStr);
            }

            writer.write('\n');

            if (dictionary != null) {
              String topTerms = getTopFeatures(cluster.getCenter(), dictionary, numTopFeatures);
              writer.write("\tTop Terms: ");
              writer.write(topTerms);
              writer.write('\n');
            }

            List<WeightedVectorWritable> points = clusterIdToPoints.get(cluster.getId());
            if (points != null) {
              writer.write("\tWeight:  Point:\n\t");
              for (Iterator<WeightedVectorWritable> iterator = points.iterator();
                  iterator.hasNext(); ) {
                WeightedVectorWritable point = iterator.next();
                writer.write(String.valueOf(point.getWeight()));
                writer.write(": ");
                writer.write(AbstractCluster.formatVector(point.getVector(), dictionary));
                if (iterator.hasNext()) {
                  writer.write("\n\t");
                }
              }
              writer.write('\n');
            }
          }
        } finally {
          reader.close();
        }
      }
    } finally {
      writer.close();
    }
  }
Example 16
  protected void sequenceCrush(FileSystem fs, FileStatus[] status)
      throws IOException, CrushException {
    l4j.info("Sequence file crushing activated");
    Class keyClass = null;
    Class valueClass = null;
    SequenceFile.Writer writer = null;
    for (FileStatus stat : status) {
      if (reporter != null) {
        reporter.setStatus("Crushing on " + stat.getPath());
        l4j.info("Current file " + stat.getPath());
        l4j.info("length " + stat.getLen());
        reporter.incrCounter(CrushMapper.CrushCounters.FILES_CRUSHED, 1);
      }
      Path p1 = stat.getPath();
      SequenceFile.Reader read = new SequenceFile.Reader(fs, p1, jobConf);
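      // the first input file determines the key/value classes of the crushed output;
      // every later file must use the same classes or a CrushException is thrown below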
      if (keyClass == null) {
        keyClass = read.getKeyClass();
        valueClass = read.getValueClass();
        writer =
            SequenceFile.createWriter(
                fs, jobConf, outPath, keyClass, valueClass, this.compressionType, this.codec);
      } else {
        if (!(keyClass.equals(read.getKeyClass()) && valueClass.equals(read.getValueClass()))) {
          read.close();
          writer.close();
          throw new CrushException(
              "File  "
                  + stat.getPath()
                  + " keyClass "
                  + read.getKeyClass()
                  + " valueClass "
                  + read.getValueClassName()
                  + " does not match"
                  + " other files in folder");
        }
      }

      Writable k = (Writable) ReflectionUtils.newInstance(keyClass, jobConf);
      Writable v = (Writable) ReflectionUtils.newInstance(valueClass, jobConf);

      int rowCount = 0;
      while (read.next(k, v)) {

        writer.append(k, v);
        rowCount++;
        if (rowCount % 100000 == 0) {
          if (reporter != null) {
            reporter.setStatus(stat + " at row " + rowCount);
            l4j.debug(stat + " at row " + rowCount);
          }
        }
      }
      read.close();
      if (reporter != null) {
        reporter.incrCounter(CrushMapper.CrushCounters.ROWS_WRITTEN, rowCount);
      }
    } // end for
    writer.close();

    l4j.info("crushed file written to " + outPath);
  }