public DoubleMatrix getScoreMatrix(File file) {
  Counter<String> docWords = new Counter<String>();
  try {
    LineIterator iter = FileUtils.lineIterator(file);
    while (iter.hasNext()) {
      Tokenizer t =
          tokenizerFactory.create(new InputHomogenization(iter.nextLine()).transform());
      while (t.hasMoreTokens()) {
        docWords.incrementCount(t.nextToken(), 1.0);
      }
    }
    iter.close();
  } catch (IOException e) {
    throw new IllegalStateException("Unable to read file", e);
  }
  // Build a 1 x |vocab| row vector; vocabulary words present in the document
  // get their precomputed score, all others stay at zero.
  DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());
  for (int i = 0; i < currVocab.size(); i++) {
    if (docWords.getCount(currVocab.get(i).toString()) > 0) {
      ret.put(i, wordScores.getCount(currVocab.get(i).toString()));
    }
  }
  return ret;
}
@Override
protected void setUp() throws Exception {
  inMemorySenseiService =
      InMemorySenseiService.valueOf(
          new File(
              InMemoryIndexPerfTest.class
                  .getClassLoader()
                  .getResource("test-conf/node1/")
                  .toURI()));
  LineIterator lineIterator =
      FileUtils.lineIterator(
          new File(
              InMemoryIndexPerfTest.class
                  .getClassLoader()
                  .getResource("data/test_data.json")
                  .toURI()));
  int i = 0;
  docs = new ArrayList<JSONObject>();
  // Load at most the first 100 JSON documents from the test data file.
  while (lineIterator.hasNext() && i < 100) {
    String line = lineIterator.next();
    if (line != null && line.contains("{")) {
      docs.add(new JSONObject(line));
    }
    i++;
  }
  lineIterator.close();
}
public static void main(String[] args) throws IOException {
  String workDir = "E:/dev_workspace/tmp/workspace/duc2007";
  String idfFilename = "duc2007.idf";
  final double TOTAL_PAGE_COUNT = 30000000000.0D;
  Map<String, Double> idfValues = new HashMap<String, Double>();
  File idfFile = FileUtils.getFile(workDir + "/" + DIR_IDF_FILE, idfFilename);
  log.info("Loading idf value file[" + idfFile.getAbsolutePath() + "]");
  LineIterator lineIterator = null;
  try {
    lineIterator = FileUtils.lineIterator(idfFile, DEFAULT_CHARSET.toString());
    while (lineIterator.hasNext()) {
      String line = lineIterator.nextLine();
      // Each line holds a term and its page count, separated by "###".
      String[] strs = line.split("###");
      if (strs.length != 2) {
        log.warn("Line[" + line + "] format is illegal, ignore it!");
        continue;
      }
      idfValues.put(strs[0].trim(), Long.parseLong(strs[1]) / TOTAL_PAGE_COUNT);
    }
    log.info("Load idf value file[" + idfFile.getAbsolutePath() + "] finished!");
  } catch (IOException e) {
    log.error("Load idf value file[" + idfFile.getAbsolutePath() + "] error!", e);
    throw e;
  } finally {
    if (lineIterator != null) {
      lineIterator.close();
    }
  }
  String question =
      "Describe the legal battle between various recording artists and members of the record"
          + " industry and the Internet music site Napster. What support, or lack thereof,"
          + " have the litigants received?";
  EhCacheUtil ehCacheUtil = new EhCacheUtil("db_cache_vec", "lab");
  SummaryBuilderByVector summaryBuilder =
      new SummaryBuilderByVector(
          workDir, "0", "D0714D.txt", 10, idfValues, question, ehCacheUtil, 1.0f, 1.6f);
  ExecutorService es = Executors.newSingleThreadExecutor();
  Future<Boolean> future = es.submit(summaryBuilder);
  try {
    future.get();
  } catch (InterruptedException | ExecutionException e) {
    e.printStackTrace();
  }
  es.shutdown();
  EhCacheUtil.close();
}
/** Prints the help information from conf/help.info. */
private static void showHelpInfo() {
  String helpfile =
      System.getProperty("user.dir") + File.separator + "conf" + File.separator + "help.info";
  File f = new File(helpfile);
  if (!f.exists()) {
    System.out.println("help.info does not exist");
  } else {
    try {
      LineIterator itr = FileUtils.lineIterator(f, "UTF-8");
      while (itr.hasNext()) {
        System.out.println(itr.nextLine());
      }
      itr.close();
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
}
/**
 * Creates a hash code from the source code of the warning line and the surrounding context.
 *
 * @param fileName the absolute path of the file to read
 * @param line the line of the warning
 * @param encoding the encoding of the file, if <code>null</code> or empty then the default
 *     encoding of the platform is used
 * @return a hash code of the source code
 * @throws IOException if the contents of the file could not be read
 */
public int create(final String fileName, final int line, final String encoding)
    throws IOException {
  LineIterator lineIterator = EncodingValidator.readFile(fileName, encoding);
  StringBuilder context = new StringBuilder(1000);
  // Collect the lines surrounding the warning line (three before to three after).
  for (int i = 0; lineIterator.hasNext(); i++) {
    String currentLine = lineIterator.nextLine();
    if (i >= line - 3) {
      context.append(currentLine);
    }
    if (i > line + 3) {
      break;
    }
  }
  lineIterator.close();
  return context.toString().hashCode();
}
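// A hypothetical usage sketch of the method above; "ContextHashCode" is an
// assumed name for the enclosing class, which is not shown here.
int contextHash = new ContextHashCode().create("/path/to/Foo.java", 42, "UTF-8");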
/**
 * Loads an in memory cache from the given path (sets syn0 and the vocab).
 *
 * @param vectorsFile the path of the file to load
 * @return a pair of the loaded lookup table and vocab cache
 * @throws FileNotFoundException if the vectors file cannot be opened
 */
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile)
    throws FileNotFoundException {
  BufferedReader reader = new BufferedReader(new FileReader(vectorsFile));
  VocabCache cache = new InMemoryLookupCache();
  InMemoryLookupTable lookupTable;
  LineIterator iter = IOUtils.lineIterator(reader);
  List<INDArray> arrays = new ArrayList<>();
  while (iter.hasNext()) {
    String line = iter.nextLine();
    // Each line holds a word followed by its space-separated vector components.
    String[] split = line.split(" ");
    String word = split[0];
    VocabWord word1 = new VocabWord(1.0, word);
    cache.addToken(word1);
    cache.addWordToIndex(cache.numWords(), word);
    word1.setIndex(cache.numWords());
    cache.putVocabWord(word);
    INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1));
    for (int i = 1; i < split.length; i++) {
      row.putScalar(i - 1, Float.parseFloat(split[i]));
    }
    arrays.add(row);
  }
  INDArray syn = Nd4j.create(new int[] {arrays.size(), arrays.get(0).columns()});
  for (int i = 0; i < syn.rows(); i++) {
    syn.putRow(i, arrays.get(i));
  }
  lookupTable =
      (InMemoryLookupTable)
          new InMemoryLookupTable.Builder()
              .vectorLength(arrays.get(0).columns())
              .useAdaGrad(false)
              .cache(cache)
              .build();
  Nd4j.clearNans(syn);
  lookupTable.setSyn0(syn);
  iter.close();
  return new Pair<>(lookupTable, cache);
}
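// A minimal usage sketch, assuming a word2vec-style text file in the format
// parsed above (one word per line, followed by its vector components); the
// file name is an assumption.
Pair<InMemoryLookupTable, VocabCache> vectors = loadTxt(new File("vectors.txt"));
INDArray syn0 = vectors.getFirst().getSyn0();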
public void load() throws IOException {
  log.info("Loading lexicon...");
  File dataFile = new File("data/lexicon.txt.gz");
  Reader reader =
      new BufferedReader(
          new InputStreamReader(new GZIPInputStream(new FileInputStream(dataFile))));
  LineIterator iterator = IOUtils.lineIterator(reader);
  while (iterator.hasNext()) {
    String line = iterator.nextLine();
    // Each line holds a word followed by its possible POS tags.
    String[] splits = line.split("\\s");
    for (int x = 1; x < splits.length; ++x) {
      POSTag tag = POSTag.fromString(splits[x]);
      if (tag == null) {
        log.warn("Unknown tag: {0}", splits[x]);
      } else {
        lexiconMap.put(splits[0], tag);
      }
    }
  }
  iterator.close();
  log.info("Lexicon loaded!");
}
private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException {
  if (useBloomFilter) {
    redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS);
    redirects =
        new LRUCache<String, String>(5000) {
          protected String loadValue(String src) {
            String normalized = TitleNameIndexer.normalize(src);
            if (normalized == null) {
              return src;
            }
            return normalized;
          }
        };
  } else {
    redirects = new StringMap<String>();
  }
  if (showInitProgress) {
    System.out.println(
        "Loading the most recent redirect pages from Wikipedia to normalize the output links"
            + " to the latest version");
  }
  if (pathToEvaluationRedirectsData != null) {
    InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData);
    LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8);
    long linecount = 0;
    while (iterator.hasNext()) {
      String line = iterator.nextLine();
      if (showInitProgress && linecount++ % 100000 == 0) {
        System.out.println("loading the latest redirects; linecount=" + linecount);
      }
      // Each line maps a source title to its redirect target, separated by a tab.
      String[] parts = StringUtils.split(line, '\t');
      String src = parts[0].trim().replace(' ', '_');
      String trg = parts[1].trim().replace(' ', '_');
      if (useBloomFilter) {
        redirectFilter.put(src);
      } else {
        redirects.put(src, trg);
      }
    }
    iterator.close();
  }
  redirects = Collections.unmodifiableMap(redirects);
  if (showInitProgress) {
    System.out.println(
        "Done - Loading the most recent redirect pages from Wikipedia to normalize the output"
            + " links to the latest version");
  }
}
@Override
protected void doClose() throws Exception {
  lineIterator.close();
}
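// A minimal sketch of the kind of line-oriented reader the callback above could
// belong to; the class and member names below are assumptions, not the original
// enclosing type. It pairs the close() call with matching open/read callbacks.
public class FlatFileLineReader {
  private LineIterator lineIterator;

  protected void doOpen(File file) throws IOException {
    lineIterator = FileUtils.lineIterator(file, "UTF-8");
  }

  protected String doRead() {
    // Return the next line, or null once the file is exhausted.
    return lineIterator.hasNext() ? lineIterator.nextLine() : null;
  }

  protected void doClose() throws Exception {
    lineIterator.close();
  }
}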
/**
 * Runs a MAF file through the Oncotator and OMA tools.
 *
 * @param inputMAFURL String
 * @param outputMAFURL String
 * @throws Exception
 */
@Override
public void oncotateMAF(String inputMAFURL, String outputMAFURL) throws Exception {
  // sanity check
  if (inputMAFURL == null
      || inputMAFURL.length() == 0
      || outputMAFURL == null
      || outputMAFURL.length() == 0) {
    throw new IllegalArgumentException(
        "oncotateMAFdownloadFile(): url or urlDestination argument is null...");
  }
  URL inputMAF = new URL(inputMAFURL);
  URL outputMAF = new URL(outputMAFURL);
  // determine if we have to call liftover
  boolean cleanOncotatorInputFile = false;
  File oncotatorInputFile = new File(inputMAF.getFile());
  org.apache.commons.io.LineIterator it =
      org.apache.commons.io.FileUtils.lineIterator(oncotatorInputFile);
  it.nextLine(); // skip header
  String[] parts = it.nextLine().split("\t");
  it.close(); // close on both paths, not only when liftover is needed
  if (parts[3].contains("36") || parts[3].equals("hg18")) {
    File liftoverInputFile =
        org.apache.commons.io.FileUtils.getFile(
            org.apache.commons.io.FileUtils.getTempDirectory(), "liftoverInputFile");
    org.apache.commons.io.FileUtils.copyFile(oncotatorInputFile, liftoverInputFile);
    oncotatorInputFile = new File(inputMAF.getFile());
    // call lift over
    if (LOG.isInfoEnabled()) {
      LOG.info("oncotateMAF(), calling Hg18ToHg19...");
    }
    Hg18ToHg19.driver(
        liftoverInputFile.getCanonicalPath(),
        oncotatorInputFile.getCanonicalPath(),
        getLiftOverBinary(),
        getLiftOverChain());
    org.apache.commons.io.FileUtils.forceDelete(liftoverInputFile);
    cleanOncotatorInputFile = true;
  }
  // create a temp output file from the oncotator
  File oncotatorOutputFile =
      org.apache.commons.io.FileUtils.getFile(
          org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorOutputFile");
  // call oncotator
  if (LOG.isInfoEnabled()) {
    LOG.info("oncotateMAF(), calling OncotateTool...");
  }
  OncotateTool.driver(
      oncotatorInputFile.getCanonicalPath(),
      oncotatorOutputFile.getCanonicalPath(),
      true,
      true,
      true);
  // we call OMA here - we use output from oncotator as input file
  if (LOG.isInfoEnabled()) {
    LOG.info("oncotateMAF(), calling MutationAssessorTool...");
  }
  File outputMAFFile = new File(outputMAF.getFile());
  outputMAFFile.createNewFile();
  MutationAssessorTool.driver(
      oncotatorOutputFile.getCanonicalPath(),
      outputMAFFile.getCanonicalPath(),
      false,
      true,
      true);
  // clean up
  org.apache.commons.io.FileUtils.forceDelete(oncotatorOutputFile);
  if (cleanOncotatorInputFile) {
    org.apache.commons.io.FileUtils.forceDelete(oncotatorInputFile);
  }
}
/**
 * Get the case list from the staging file.
 *
 * @param caseIDs CaseIDs;
 * @param portalMetadata PortalMetadata
 * @param cancerStudyMetadata CancerStudyMetadata
 * @param stagingFilename String
 * @return List<String>
 * @throws Exception
 */
@Override
public List<String> getCaseListFromStagingFile(
    CaseIDs caseIDs,
    PortalMetadata portalMetadata,
    CancerStudyMetadata cancerStudyMetadata,
    String stagingFilename)
    throws Exception {
  if (LOG.isInfoEnabled()) {
    LOG.info("getCaseListFromStagingFile(): " + stagingFilename);
  }
  // we use a set here to avoid duplicate case ids
  HashSet<String> caseSet = new HashSet<String>();
  // staging file
  File stagingFile =
      org.apache.commons.io.FileUtils.getFile(
          portalMetadata.getStagingDirectory(),
          cancerStudyMetadata.getStudyPath(),
          stagingFilename);
  // sanity check
  if (!stagingFile.exists()) {
    return new ArrayList<String>();
  }
  // iterate over all rows in file
  org.apache.commons.io.LineIterator it =
      org.apache.commons.io.FileUtils.lineIterator(stagingFile);
  try {
    int mafCaseIDColumnIndex = 0;
    boolean processHeader = true;
    while (it.hasNext()) {
      // create a string list from row in file
      List<String> thisRow = Arrays.asList(it.nextLine().split(Converter.VALUE_DELIMITER));
      // is this the header row?
      if (processHeader) {
        // look for MAF file case id column header
        mafCaseIDColumnIndex = thisRow.indexOf(Converter.MUTATION_CASE_ID_COLUMN_HEADER);
        // this is not a MAF file, header contains the case ids, return here
        if (mafCaseIDColumnIndex == -1) {
          for (String potentialCaseID : thisRow) {
            if (caseIDs.isTumorCaseID(potentialCaseID)) {
              caseSet.add(caseIDs.convertCaseID(potentialCaseID));
            }
          }
          break;
        }
        processHeader = false;
        continue;
      }
      // add the value at mafCaseIDColumnIndex to the return set - this is a case ID
      String potentialCaseID = thisRow.get(mafCaseIDColumnIndex);
      if (caseIDs.isTumorCaseID(potentialCaseID)) {
        caseSet.add(caseIDs.convertCaseID(potentialCaseID));
      }
    }
  } finally {
    it.close();
  }
  // outta here
  return new ArrayList<String>(caseSet);
}
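// A general note on the snippets above: in newer versions of Commons IO,
// LineIterator implements Closeable, so try-with-resources can replace the
// manual close() calls. A minimal sketch (the file name is an assumption):
try (LineIterator it = FileUtils.lineIterator(new File("input.txt"), "UTF-8")) {
  while (it.hasNext()) {
    System.out.println(it.nextLine());
  }
}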