/**
   * Downloads the file at {@code urlSource} to the location named by {@code urlDestination}.
   *
   * <p>When {@code GzipUtils} recognises {@code urlSource} as a gzip-compressed name, the file is
   * first downloaded into the system temp directory, decompressed there, and the decompressed file
   * is moved to the destination (an existing destination file is deleted first). Otherwise the
   * source is copied straight to the destination.
   *
   * <p>Temporary files created in the temp directory are removed whether or not the download,
   * decompression or move succeeds.
   *
   * @param urlSource String URL of the file to download; must be non-empty
   * @param urlDestination String URL whose file part names the destination; must be non-empty
   * @throws IllegalArgumentException if either argument is null or empty
   * @throws Exception if a URL is malformed or downloading, decompressing or moving fails
   */
  @Override
  public void downloadFile(String urlSource, String urlDestination) throws Exception {

    // sanity check
    if (urlSource == null
        || urlSource.length() == 0
        || urlDestination == null
        || urlDestination.length() == 0) {
      throw new IllegalArgumentException(
          "downloadFile(): urlSource or urlDestination argument is null...");
    }

    // URLs for given parameters
    URL source = new URL(urlSource);
    URL destination = new URL(urlDestination);

    // we have a compressed file
    if (GzipUtils.isCompressedFilename(urlSource)) {
      // download to temp destination
      File tempDestinationFile =
          org.apache.commons.io.FileUtils.getFile(
              org.apache.commons.io.FileUtils.getTempDirectory(),
              new File(source.getFile()).getName());
      // set once the decompressed name is known, so the finally block can remove a leftover
      File tempUncompressedFile = null;
      try {
        if (LOG.isInfoEnabled()) {
          LOG.info("downloadFile(), " + urlSource + ", this may take a while...");
        }
        org.apache.commons.io.FileUtils.copyURLToFile(source, tempDestinationFile);
        if (LOG.isInfoEnabled()) {
          LOG.info("downloadFile(), gunzip: we have compressed file, decompressing...");
        }
        // decompress the file
        String tempCanonicalPath = tempDestinationFile.getCanonicalPath();
        tempUncompressedFile =
            org.apache.commons.io.FileUtils.getFile(
                GzipUtils.getUncompressedFilename(tempCanonicalPath));
        gunzip(tempCanonicalPath);
        if (LOG.isInfoEnabled()) {
          LOG.info("downloadFile(), gunzip complete...");
        }
        // move temp/decompressed file to final destination
        File destinationFile = new File(destination.getFile());
        if (destinationFile.exists()) {
          org.apache.commons.io.FileUtils.forceDelete(destinationFile);
        }
        org.apache.commons.io.FileUtils.moveFile(tempUncompressedFile, destinationFile);
      } finally {
        // let's clean up after ourselves - remove the compressed file and, when the move did not
        // happen, whatever decompressed output was left in the temp directory
        removeTempFile(tempDestinationFile);
        if (tempUncompressedFile != null) {
          removeTempFile(tempUncompressedFile);
        }
      }
    }
    // uncompressed file, download directly to urlDestination
    else {
      if (LOG.isInfoEnabled()) {
        LOG.info("downloadFile(), " + urlSource + ", this may take a while...");
      }
      org.apache.commons.io.FileUtils.copyURLToFile(
          source, org.apache.commons.io.FileUtils.getFile(destination.getFile()));
    }
  }

  /**
   * Deletes a temporary file if it still exists, logging (not throwing) when that fails.
   *
   * <p>Uses the non-recursive {@link File#delete()} on purpose so a path that unexpectedly
   * resolves to a non-empty directory is never wiped.
   *
   * @param file the temporary file to remove
   */
  private void removeTempFile(File file) {
    if (file.exists() && !file.delete() && LOG.isInfoEnabled()) {
      LOG.info("downloadFile(), could not remove temp file: " + file);
    }
  }
  /**
   * Opens the request's source file as a gzip stream and passes it to {@code expandContainer}.
   *
   * <p>An {@code IOException} raised while opening or expanding the archive is printed to
   * {@code System.err} and otherwise ignored, so processing continues after a corrupt archive.
   *
   * @param uri The URI of the file to identify
   * @param request The Identification Request
   * @throws CommandExecutionException When closing the archive file input fails
   */
  public final void identify(final URI uri, final IdentificationRequest request)
      throws CommandExecutionException {

    final String newPath = makeContainerURI("gzip", request.getFileName());
    setSlash1("");
    final URI newUri = URI.create(GzipUtils.getUncompressedFilename(uri.toString()));

    final RequestIdentifier identifier = new RequestIdentifier(newUri);
    final RequestMetaData metaData = new RequestMetaData(SIZE, TIME, uri.getPath());
    final GZipIdentificationRequest gzRequest =
        new GZipIdentificationRequest(metaData, identifier, getTmpDir());

    // The raw stream is tracked separately: the gzip wrapper's constructor reads the header and
    // can throw, in which case nothing else would ever close the underlying file handle.
    FileInputStream fileIn = null;
    GzipCompressorInputStream gzin = null;
    try {
      fileIn = new FileInputStream(request.getSourceFile());
      gzin = new GzipCompressorInputStream(fileIn, true);

      expandContainer(gzRequest, gzin, newPath);

    } catch (IOException ioe) {
      System.err.println(ioe + " (" + newPath + ")"); // continue after corrupt archive
    } finally {
      try {
        if (gzin != null) {
          // closing the wrapper also closes the file stream it wraps
          gzin.close();
        } else if (fileIn != null) {
          // wrapper was never built; release the file handle directly
          fileIn.close();
        }
      } catch (IOException ioe) {
        throw new CommandExecutionException(ioe.getMessage(), ioe);
      }
    }
  }
  /**
   * Returns the contents of the datafile as specified by ImportDataRecord in a DataMatrix.
   *
   * <p>The file at the record's canonical data path is opened as a stream. When the (lower-cased)
   * path is recognised as gzip-compressed by {@code GzipUtils}, the stream is first passed through
   * {@code readContent}; the resulting stream is handed to {@code getDataMatrix}.
   *
   * <p>NOTE(review): earlier documentation said this may return null on a read problem; whether
   * that holds depends on {@code getDataMatrix}, which is not visible here - confirm.
   *
   * @param importDataRecord ImportDataRecord
   * @return DataMatrix
   * @throws Exception if the file cannot be opened or processed
   */
  @Override
  public DataMatrix getFileContents(ImportDataRecord importDataRecord) throws Exception {

    if (LOG.isInfoEnabled()) {
      LOG.info("getFileContents(): " + importDataRecord);
    }

    // determine path to file (does override file exist?)
    String fileCanonicalPath = importDataRecord.getCanonicalPathToData();

    // data can be compressed
    boolean compressed = GzipUtils.isCompressedFilename(fileCanonicalPath.toLowerCase());

    if (LOG.isInfoEnabled()) {
      LOG.info("getFileContents(): processing file: " + fileCanonicalPath);
    }

    // get filedata inputstream
    InputStream rawStream =
        org.apache.commons.io.FileUtils.openInputStream(new File(fileCanonicalPath));
    InputStream fileContents;

    if (compressed) {
      boolean handedOver = false;
      try {
        fileContents = readContent(importDataRecord, rawStream);
        handedOver = true;
      } finally {
        // readContent failed before taking ownership: release the file handle ourselves
        if (!handedOver) {
          try {
            rawStream.close();
          } catch (Exception ignored) {
            // the failure from readContent is the one worth reporting; it is already propagating
          }
        }
      }
    } else {
      fileContents = rawStream;
    }

    // outta here
    return getDataMatrix(fileContents);
  }
Exemplo n.º 4
0
 /**
  * Chooses the compression mode implied by a file name.
  *
  * <p>The gzip check is made first; the xz check is only consulted when it fails.
  *
  * @param filename the file name to inspect
  * @return {@code GZIP} when {@code GzipUtils} recognises the name, {@code XZ} when
  *     {@code XZUtils} does, otherwise {@code NONE}
  */
 @Override
 public CompressionMode getEffectiveCompressionMode(String filename) {
   if (GzipUtils.isCompressedFilename(filename)) {
     return GZIP;
   }
   return XZUtils.isCompressedFilename(filename) ? XZ : NONE;
 }
  /**
   * Reads a binary model file into a {@code Word2Vec} instance.
   *
   * <p>The file is read through a gzip stream when its name is recognised as gzip-compressed by
   * {@code GzipUtils}, and as a plain file otherwise. Two strings are read first and parsed as the
   * word count and the vector length. Then, for each word, a string is read followed by
   * vector-length floats; the unit-normalised vector is stored as a row of the weight matrix and
   * the word is registered in the vocabulary cache. An empty word string is skipped without
   * reading a vector for it.
   *
   * @param modelFile the model file to load
   * @return a {@code Word2Vec} whose vocab cache and lookup table hold the loaded data
   * @throws NumberFormatException if either header string is not a valid integer
   * @throws IOException if the file cannot be opened or read
   */
  private static Word2Vec readBinaryModel(File modelFile)
      throws NumberFormatException, IOException {
    final boolean gzipped = GzipUtils.isCompressedFilename(modelFile.getName());

    VocabCache vocab;
    InMemoryLookupTable table;
    INDArray weights;

    try (BufferedInputStream buffered =
            gzipped
                ? new BufferedInputStream(new GZIPInputStream(new FileInputStream(modelFile)))
                : new BufferedInputStream(new FileInputStream(modelFile));
        DataInputStream in = new DataInputStream(buffered)) {
      // header: number of words, then length of each vector
      final int wordCount = Integer.parseInt(readString(in));
      final int vectorLength = Integer.parseInt(readString(in));

      weights = Nd4j.create(wordCount, vectorLength);
      vocab = new InMemoryLookupCache(false);
      table =
          (InMemoryLookupTable)
              new InMemoryLookupTable.Builder().cache(vocab).vectorLength(vectorLength).build();

      for (int row = 0; row < wordCount; row++) {
        final String token = readString(in);
        log.trace("Loading " + token + " with word " + row);

        if (!token.isEmpty()) {
          final float[] values = new float[vectorLength];
          for (int col = 0; col < vectorLength; col++) {
            values[col] = readFloat(in);
          }

          // NOTE(review): the matrix row is the loop index while the vocab index is
          // vocab.numWords(); the two diverge once an empty word has been skipped - confirm
          // this is intended.
          weights.putRow(row, Transforms.unitVec(Nd4j.create(values)));

          vocab.addWordToIndex(vocab.numWords(), token);
          vocab.addToken(new VocabWord(1, token));
          vocab.putVocabWord(token);
        }
      }
    }

    final Word2Vec model = new Word2Vec();

    table.setSyn0(weights);
    model.setVocab(vocab);
    model.setLookupTable(table);
    return model;
  }
  /**
   * Helper function to gunzip file. gzipFile param is canonical path.
   *
   * <p>The decompressed data is written to the name {@code GzipUtils.getUncompressedFilename}
   * derives from {@code gzipFile}; the compressed file itself is left in place.
   *
   * @param gzipFile String
   * @throws IllegalArgumentException if no distinct uncompressed name can be derived, since the
   *     output would then overwrite the input while it is being read
   * @throws Exception if the file cannot be read, is not valid gzip data, or the output cannot be
   *     written or closed
   */
  private static void gunzip(String gzipFile) throws Exception {

    // unzip into file less the .gz
    String outFilePath = GzipUtils.getUncompressedFilename(gzipFile);
    if (outFilePath.equals(gzipFile)) {
      throw new IllegalArgumentException(
          "gunzip(): cannot derive an uncompressed file name from: " + gzipFile);
    }

    // The raw file stream is its own resource so it is closed even when the GZIPInputStream
    // constructor rejects the header; the output is opened last so a bad input creates no file.
    try (FileInputStream fis = new FileInputStream(gzipFile);
        GZIPInputStream gis = new GZIPInputStream(fis);
        FileOutputStream fos = new FileOutputStream(outFilePath)) {
      IOUtils.copy(gis, fos);
    }
  }