public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    try {
      LineIterator iter = FileUtils.lineIterator(file);
      while (iter.hasNext()) {
        Tokenizer t =
            tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
        while (t.hasMoreTokens()) {
          docWords.incrementCount(t.nextToken(), 1.0);
        }
      }

      iter.close();
    } catch (IOException e) {
      throw new IllegalStateException("Unable to read file", e);
    }
    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size()); // one row, one column per vocab word

    for (int i = 0; i < currVocab.size(); i++) {
      String word = currVocab.get(i).toString();
      // Only words that actually occur in this document receive their global score.
      if (docWords.getCount(word) > 0) {
        ret.put(i, wordScores.getCount(word));
      }
    }

    return ret;
  }
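
A minimal standalone sketch of the same read-tokenize-count pattern, assuming commons-io 2.2+ (where LineIterator implements Closeable, so try-with-resources can replace the manual close()) and plain whitespace tokenization in place of the tokenizerFactory above:

import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.LineIterator;

public class WordCountSketch {
  // Counts whitespace-separated tokens per line; the iterator is closed
  // automatically even if reading fails partway through.
  public static Map<String, Integer> countTokens(File file) throws IOException {
    Map<String, Integer> counts = new HashMap<>();
    try (LineIterator iter = FileUtils.lineIterator(file, "UTF-8")) {
      while (iter.hasNext()) {
        for (String token : iter.nextLine().split("\\s+")) {
          if (!token.isEmpty()) {
            counts.merge(token, 1, Integer::sum);
          }
        }
      }
    }
    return counts;
  }
}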
 @Override
 protected void setUp() throws Exception {
   inMemorySenseiService =
       InMemorySenseiService.valueOf(
           new File(
               InMemoryIndexPerfTest.class
                   .getClassLoader()
                   .getResource("test-conf/node1/")
                   .toURI()));
   LineIterator lineIterator =
       FileUtils.lineIterator(
           new File(
               InMemoryIndexPerfTest.class
                   .getClassLoader()
                   .getResource("data/test_data.json")
                   .toURI()));
   int i = 0;
   docs = new ArrayList<JSONObject>();
   // Read at most the first 100 JSON documents from the test data file.
   while (lineIterator.hasNext() && i < 100) {
     String line = lineIterator.next();
     if (line != null && line.contains("{")) docs.add(new JSONObject(line));
     i++;
   }
   lineIterator.close();
 }
  public static void main(String[] args) throws IOException {

    String workDir = "E:/dev_workspace/tmp/workspace/duc2007";
    String idfFilename = "duc2007.idf";

    final double TOTAL_PAGE_COUNT = 30000000000.0D;

    Map<String, Double> idfValues = new HashMap<String, Double>();
    File idfFile = FileUtils.getFile(workDir + "/" + DIR_IDF_FILE, idfFilename);
    log.info("Loading idf value file[" + idfFile.getAbsolutePath() + "]");
    LineIterator lineIterator = null;
    try {
      lineIterator = FileUtils.lineIterator(idfFile, DEFAULT_CHARSET.toString());
      while (lineIterator.hasNext()) {
        String line = lineIterator.nextLine();
        String[] strs = line.split("###");
        if (strs.length != 2) {
          log.warn("Line[" + line + "] is malformed, ignoring it!");
          continue;
        }
        // Note: the stored value is document-count / TOTAL_PAGE_COUNT, not a log-scaled idf.
        idfValues.put(strs[0].trim(), Long.parseLong(strs[1].trim()) / TOTAL_PAGE_COUNT);
      }
      log.info("Finished loading idf value file[" + idfFile.getAbsolutePath() + "]");
    } catch (IOException e) {
      log.error("Failed to load idf value file[" + idfFile.getAbsolutePath() + "]", e);
      throw e;
    } finally {
      if (lineIterator != null) {
        lineIterator.close();
      }
    }

    String question =
        "Describe the legal battle between various recording artists and members of the record industry and the Internet music site Napster. What support, or lack thereof, have the litigants received?";

    EhCacheUtil ehCacheUtil = new EhCacheUtil("db_cache_vec", "lab");

    SummaryBuilderByVector summaryBuilder =
        new SummaryBuilderByVector(
            workDir, "0", "D0714D.txt", 10, idfValues, question, ehCacheUtil, 1.0f, 1.6f);
    ExecutorService es = Executors.newSingleThreadExecutor();
    Future<Boolean> future = es.submit(summaryBuilder);
    try {
      future.get();
    } catch (InterruptedException | ExecutionException e) {
      e.printStackTrace();
    }
    es.shutdown();
    EhCacheUtil.close();
  }
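
For reference, a minimal sketch of the implied idf-file line format; the term and count below are illustrative, not taken from the source data:

// Hypothetical line: a term and its raw document count separated by "###".
String[] strs = "napster###123456".split("###");
double value = Long.parseLong(strs[1]) / 30000000000.0D; // ≈ 4.1e-6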
Example 4
 /** Prints the help information. */
 private static void showHelpInfo() {
   String helpfile =
       System.getProperty("user.dir") + File.separator + "conf" + File.separator + "help.info";
   File f = new File(helpfile);
   if (!f.exists()) {
     System.out.println("help.info not exists");
   } else {
     try (LineIterator itr = FileUtils.lineIterator(f, "UTF-8")) {
       // try-with-resources closes the iterator even if printing fails partway
       // through (requires commons-io 2.2+, where LineIterator implements Closeable).
       while (itr.hasNext()) {
         System.out.println(itr.nextLine());
       }
     } catch (IOException e) {
       e.printStackTrace();
     }
   }
 }
Example 5
  /**
   * Creates a hash code from the source code of the warning line and the surrounding context.
   *
   * @param fileName the absolute path of the file to read
   * @param line the line of the warning
   * @param encoding the encoding of the file, if <code>null</code> or empty then the default
   *     encoding of the platform is used
   * @return a hash code of the source code
   * @throws IOException if the contents of the file could not be read
   */
  public int create(final String fileName, final int line, final String encoding)
      throws IOException {
    LineIterator lineIterator = EncodingValidator.readFile(fileName, encoding);

    StringBuilder context = new StringBuilder(1000);
    // Collect the source lines in a window around the warning line
    // (roughly three lines of context on either side).
    for (int i = 0; lineIterator.hasNext(); i++) {
      String currentLine = lineIterator.nextLine();
      if (i >= line - 3) {
        context.append(currentLine);
      }
      if (i > line + 3) {
        break;
      }
    }
    lineIterator.close();

    return context.toString().hashCode();
  }
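
A hypothetical usage sketch; the enclosing class is not shown above, so ContextHashCode is an assumed name. Two warnings hash equally as long as their surrounding context lines are identical, which can keep a warning's identity stable when unrelated code moves:

// Hypothetical call; the class name, path, and line number are assumptions.
int fingerprint = new ContextHashCode().create("/project/src/Main.java", 42, "UTF-8");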
  /**
   * Loads an in-memory cache from the given path (sets syn0 and the vocab).
   *
   * @param vectorsFile the path of the file to load
   * @return a pair of the populated lookup table and vocab cache
   * @throws FileNotFoundException if the vectors file cannot be opened
   */
  public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile)
      throws FileNotFoundException {
    BufferedReader reader = new BufferedReader(new FileReader(vectorsFile));
    VocabCache cache = new InMemoryLookupCache();

    InMemoryLookupTable lookupTable;

    LineIterator iter = IOUtils.lineIterator(reader);
    List<INDArray> arrays = new ArrayList<>();
    while (iter.hasNext()) {
      String line = iter.nextLine();
      String[] split = line.split(" ");
      String word = split[0];
      VocabWord word1 = new VocabWord(1.0, word);
      cache.addToken(word1);
      cache.addWordToIndex(cache.numWords(), word);
      word1.setIndex(cache.numWords());
      cache.putVocabWord(word);
      // The remaining columns are the vector components for this word.
      INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1));
      for (int i = 1; i < split.length; i++) {
        row.putScalar(i - 1, Float.parseFloat(split[i]));
      }
      arrays.add(row);
    }

    INDArray syn = Nd4j.create(new int[] {arrays.size(), arrays.get(0).columns()});
    for (int i = 0; i < syn.rows(); i++) {
      syn.putRow(i, arrays.get(i));
    }

    lookupTable =
        (InMemoryLookupTable)
            new InMemoryLookupTable.Builder()
                .vectorLength(arrays.get(0).columns())
                .useAdaGrad(false)
                .cache(cache)
                .build();
    Nd4j.clearNans(syn);
    lookupTable.setSyn0(syn);

    iter.close();

    return new Pair<>(lookupTable, cache);
  }
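
A hedged usage sketch, assuming a whitespace-delimited vectors file where each line holds a word followed by its vector components ("word 0.1 0.2 ..."); the file name is illustrative:

// Hypothetical call; the format is inferred from the split(" ") parsing above.
Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(new File("vectors.txt"));
InMemoryLookupTable table = pair.getFirst();
VocabCache vocab = pair.getSecond();
System.out.println("Loaded " + vocab.numWords() + " word vectors.");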
Example 7
  public void load() throws IOException {
    log.info("Loading lexicon...");
    File dataFile = new File("data/lexicon.txt.gz");
    Reader reader =
        new BufferedReader(
            new InputStreamReader(new GZIPInputStream(new FileInputStream(dataFile))));

    LineIterator iterator = IOUtils.lineIterator(reader);

    while (iterator.hasNext()) {
      String line = iterator.nextLine();
      String[] splits = line.split("\\s");
      // splits[0] is the word; the remaining tokens are its possible POS tags.
      for (int x = 1; x < splits.length; ++x) {
        POSTag tag = POSTag.fromString(splits[x]);
        if (tag == null) log.warn("Unknown tag: {0}", splits[x]);
        else lexiconMap.put(splits[0], tag);
      }
    }

    iterator.close();
    log.info("Lexicon loaded!");
  }
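
The loop above implies a lexicon format of one word per line followed by its possible POS tags, and lexiconMap is presumably multi-valued since put is called once per tag. A minimal sketch of that format with an illustrative entry (not from the source data):

// Hypothetical lexicon line: "book NN VB" maps "book" to both NN and VB.
Map<String, List<String>> lexicon = new HashMap<>();
String entry = "book NN VB";
String[] splits = entry.split("\\s");
for (int x = 1; x < splits.length; ++x) {
  lexicon.computeIfAbsent(splits[0], k -> new ArrayList<>()).add(splits[x]);
}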
  private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException {

    if (useBloomFilter) {
      redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS);
      redirects =
          new LRUCache<String, String>(5000) {
            protected String loadValue(String src) {
              String normalized = TitleNameIndexer.normalize(src);
              if (normalized == null) return src;
              // Return the already-computed result instead of normalizing twice.
              return normalized;
            }
          };
    } else redirects = new StringMap<String>();
    if (showInitProgress)
      System.out.println(
          "Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    if (pathToEvaluationRedirectsData != null) {
      InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData);
      LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8);

      long linecount = 0;
      while (iterator.hasNext()) {
        String line = iterator.nextLine();
        if (showInitProgress && linecount++ % 100000 == 0)
          System.out.println("loading the latest redirects; linecount=" + linecount);
        // Each line is expected to hold "source<TAB>target"; spaces in titles
        // are stored as underscores.
        String[] parts = StringUtils.split(line, '\t');

        String src = parts[0].trim().replace(' ', '_');
        String trg = parts[1].trim().replace(' ', '_');
        if (useBloomFilter) redirectFilter.put(src);
        else redirects.put(src, trg);
      }
      iterator.close();
    }
    redirects = Collections.unmodifiableMap(redirects);
    if (showInitProgress)
      System.out.println(
          "Done loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
  }
 @Override
 protected void doClose() throws Exception {
   lineIterator.close();
 }
  /**
   * Runs a MAF file through the Oncotator and OMA tools.
   *
   * @param inputMAFURL String
   * @param outputMAFURL String
   * @throws Exception
   */
  @Override
  public void oncotateMAF(String inputMAFURL, String outputMAFURL) throws Exception {

    // sanity check
    if (inputMAFURL == null
        || inputMAFURL.length() == 0
        || outputMAFURL == null
        || outputMAFURL.length() == 0) {
      throw new IllegalArgumentException(
          "oncotateMAF(): inputMAFURL or outputMAFURL argument is null or empty...");
    }

    URL inputMAF = new URL(inputMAFURL);
    URL outputMAF = new URL(outputMAFURL);

    // determine if we have to call liftover
    boolean cleanOncotatorInputFile = false;
    File oncotatorInputFile = new File(inputMAF.getFile());
    org.apache.commons.io.LineIterator it =
        org.apache.commons.io.FileUtils.lineIterator(oncotatorInputFile);
    it.nextLine(); // skip header
    // Column 3 (NCBI_Build) indicates whether the MAF is on hg18/build 36 and
    // therefore needs a liftover to hg19 before annotation.
    String[] parts = it.nextLine().split("\t");
    if (parts[3].contains("36") || parts[3].equals("hg18")) {
      it.close();
      File liftoverInputFile =
          org.apache.commons.io.FileUtils.getFile(
              org.apache.commons.io.FileUtils.getTempDirectory(), "liftoverInputFile");
      org.apache.commons.io.FileUtils.copyFile(oncotatorInputFile, liftoverInputFile);
      oncotatorInputFile = new File(inputMAF.getFile()); // liftover output replaces the original input path
      // call lift over
      if (LOG.isInfoEnabled()) {
        LOG.info("oncotateMAF(), calling Hg18ToHg19...");
      }
      Hg18ToHg19.driver(
          liftoverInputFile.getCanonicalPath(),
          oncotatorInputFile.getCanonicalPath(),
          getLiftOverBinary(),
          getLiftOverChain());
      org.apache.commons.io.FileUtils.forceDelete(liftoverInputFile);
      cleanOncotatorInputFile = true;
    }

    // create a temp output file from the oncotator
    File oncotatorOutputFile =
        org.apache.commons.io.FileUtils.getFile(
            org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorOutputFile");
    // call oncotator
    if (LOG.isInfoEnabled()) {
      LOG.info("oncotateMAF(), calling OncotateTool...");
    }
    OncotateTool.driver(
        oncotatorInputFile.getCanonicalPath(),
        oncotatorOutputFile.getCanonicalPath(),
        true,
        true,
        true);
    // we call OMA here -
    // we use output from oncotator as input file
    if (LOG.isInfoEnabled()) {
      LOG.info("oncotateMAF(), calling MutationAssessorTool...");
    }
    File outputMAFFile = new File(outputMAF.getFile());
    outputMAFFile.createNewFile();
    MutationAssessorTool.driver(
        oncotatorOutputFile.getCanonicalPath(),
        outputMAFFile.getCanonicalPath(),
        false,
        true,
        true);

    // clean up
    org.apache.commons.io.FileUtils.forceDelete(oncotatorOutputFile);
    if (cleanOncotatorInputFile) org.apache.commons.io.FileUtils.forceDelete(oncotatorInputFile);
  }
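
A hedged invocation sketch; both arguments are URLs whose getFile() must resolve to local, readable paths, and the paths here are illustrative:

// Hypothetical call; file: URLs assumed since the method does new URL(...).getFile().
oncotateMAF("file:///data/maf/input.maf", "file:///data/maf/output.maf");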
  /**
   * Get the case list from the staging file.
   *
   * @param caseIDs CaseIDs
   * @param portalMetadata PortalMetadata
   * @param cancerStudyMetadata CancerStudyMetadata
   * @param stagingFilename String
   * @return List<String>
   * @throws Exception
   */
  @Override
  public List<String> getCaseListFromStagingFile(
      CaseIDs caseIDs,
      PortalMetadata portalMetadata,
      CancerStudyMetadata cancerStudyMetadata,
      String stagingFilename)
      throws Exception {

    if (LOG.isInfoEnabled()) {
      LOG.info("getCaseListFromStagingFile(): " + stagingFilename);
    }

    // use a set to avoid duplicate case ids
    HashSet<String> caseSet = new HashSet<String>();

    // staging file
    File stagingFile =
        org.apache.commons.io.FileUtils.getFile(
            portalMetadata.getStagingDirectory(),
            cancerStudyMetadata.getStudyPath(),
            stagingFilename);
    // sanity check
    if (!stagingFile.exists()) {
      return new ArrayList<String>();
    }

    // iterate over all rows in file
    org.apache.commons.io.LineIterator it =
        org.apache.commons.io.FileUtils.lineIterator(stagingFile);
    try {
      int mafCaseIDColumnIndex = 0;
      boolean processHeader = true;
      while (it.hasNext()) {
        // create a string list from row in file
        List<String> thisRow = Arrays.asList(it.nextLine().split(Converter.VALUE_DELIMITER));
        // is this the header row?
        if (processHeader) {
          // look for MAF file case id column header
          mafCaseIDColumnIndex = thisRow.indexOf(Converter.MUTATION_CASE_ID_COLUMN_HEADER);
          // not a MAF file: the header itself contains the case ids, so collect them and stop
          if (mafCaseIDColumnIndex == -1) {
            for (String potentialCaseID : thisRow) {
              if (caseIDs.isTumorCaseID(potentialCaseID)) {
                caseSet.add(caseIDs.convertCaseID(potentialCaseID));
              }
            }
            break;
          }
          processHeader = false;
          continue;
        }
        // we want to add the value at mafCaseIDColumnIndex into return set - this is a case ID
        String potentialCaseID = thisRow.get(mafCaseIDColumnIndex);
        if (caseIDs.isTumorCaseID(potentialCaseID)) {
          caseSet.add(caseIDs.convertCaseID(potentialCaseID));
        }
      }
    } finally {
      it.close();
    }

    // outta here
    return new ArrayList<String>(caseSet);
  }
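
A hedged usage sketch, assuming the project-specific metadata objects are already in hand; the staging filename is illustrative:

// Hypothetical call; caseIDs, portalMetadata, and cancerStudyMetadata come from
// the surrounding importer framework and are assumed here.
List<String> caseList =
    getCaseListFromStagingFile(
        caseIDs, portalMetadata, cancerStudyMetadata, "data_mutations_extended.txt");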