예제 #1
0
  /**
   * Imports a CSV file in batches.
   *
   * <p>The first line is treated as a header and skipped. Every following line is split on commas;
   * column 0 becomes the id and column 1 the content of an {@code ExcelData}. Rows are handed to
   * {@code doBatchSave} in groups of {@code batchSize}, including the final, possibly smaller,
   * group. A failing batch is logged with its id range and dropped so the import can continue.
   *
   * @throws IOException if the file cannot be opened
   */
  @Test
  public void testImportCsv() throws IOException {

    long beginTime = System.currentTimeMillis();

    File file = new File("D:\\Backup\\test.csv");
    // Detect the charset before opening the stream so a detection failure cannot leak it.
    String encoding = FileCharset.getCharset(file);

    String separator = ",";
    int batchSize = 100; // number of rows saved per batch
    int totalSize = 0; // number of data rows read so far

    final List<ExcelData> dataList = Lists.newArrayList();

    InputStream is = new BufferedInputStream(new FileInputStream(file));
    try {
      LineIterator iterator = IOUtils.lineIterator(is, encoding);

      if (iterator.hasNext()) {
        iterator.nextLine(); // skip the header row
      }

      while (iterator.hasNext()) {
        totalSize++;

        String line = iterator.nextLine();
        String[] dataArray = StringUtils.split(line, separator);

        ExcelData data = new ExcelData();
        data.setId(Long.valueOf(dataArray[0]));
        data.setContent(dataArray[1]);
        dataList.add(data);

        if (totalSize % batchSize == 0) {
          saveBatchAndClear(dataList);
        }
      }

      // Rows left over after the last full batch would otherwise be silently lost.
      if (!dataList.isEmpty()) {
        saveBatchAndClear(dataList);
      }
    } finally {
      // Close even when parsing a row throws.
      IOUtils.closeQuietly(is);
    }

    long endTime = System.currentTimeMillis();
    log.info("耗时(秒):" + (endTime - beginTime) / 1000);
  }

  /** Saves one batch, logging (not rethrowing) any failure, and then empties the list. */
  private void saveBatchAndClear(List<ExcelData> batch) {
    try {
      doBatchSave(batch);
    } catch (Exception e) {
      Long fromId = batch.get(0).getId();
      Long endId = batch.get(batch.size() - 1).getId();
      log.error("from " + fromId + " to " + endId + ", error", e);
    }
    batch.clear();
  }
예제 #2
0
  /**
   * Builds a single-row score matrix for the given document.
   *
   * <p>The file is read line by line, each line is homogenized and tokenized, and token
   * occurrences are counted. The result has one column per entry of {@code currVocab}; column
   * {@code i} is set to the {@code wordScores} count of vocabulary entry {@code i} when that entry
   * occurs at least once in the file, and is left at its initial value otherwise.
   *
   * @param file the document to score
   * @return a {@code 1 x currVocab.size()} matrix
   * @throws IllegalStateException if the file cannot be read
   */
  public DoubleMatrix getScoreMatrix(File file) {
    Counter<String> docWords = new Counter<String>();
    LineIterator iter = null;
    try {
      iter = FileUtils.lineIterator(file);
      while (iter.hasNext()) {
        Tokenizer t =
            tokenizerFactory.create((new InputHomogenization(iter.nextLine()).transform()));
        while (t.hasMoreTokens()) {
          docWords.incrementCount(t.nextToken(), 1.0);
        }
      }
    } catch (IOException e) {
      throw new IllegalStateException("Unable to read file", e);
    } finally {
      // Null-safe; releases the file handle even when tokenization throws.
      LineIterator.closeQuietly(iter);
    }

    DoubleMatrix ret = new DoubleMatrix(1, currVocab.size());

    for (int i = 0; i < currVocab.size(); i++) {
      String word = currVocab.get(i).toString();
      if (docWords.getCount(word) > 0) {
        ret.put(i, wordScores.getCount(word));
      }
    }

    return ret;
  }
  /**
   * Locates all files containing serialization policies during construction of this object.
   * Serialization policies are loaded from them (and cached) as needed.
   *
   * <p>Every manifest returned by {@code listRpcPolicyManifestFiles} is read line by line. Blank
   * lines and lines starting with {@code #} are ignored; every other line is expected to be
   * {@code <rpc service interface>,<policy file name>}. The policy file is resolved against the
   * manifest's grandparent directory and registered under the interface name.
   *
   * @param servletContext context whose real root path is scanned for manifests
   * @throws IOException if a manifest cannot be read
   */
  @Autowired(required = false)
  public MultiModuleSerializationPolicyProvider(ServletContext servletContext) throws IOException {
    for (File rpcPolicyManifest : listRpcPolicyManifestFiles(servletContext.getRealPath("/"))) {
      File moduleDir = rpcPolicyManifest.getParentFile().getParentFile();
      LineIterator entries = FileUtils.lineIterator(rpcPolicyManifest);

      try {
        while (entries.hasNext()) {
          String line = entries.nextLine();
          if (line.startsWith("#") || line.trim().length() == 0) {
            continue;
          }

          String[] entry = line.split(",");
          assert entry.length == 2
              : "Invalid format of file: " + rpcPolicyManifest.getAbsolutePath();
          String rpcServiceInterfaceName = entry[0].trim();
          String rpcPolicyStrongFileName = entry[1].trim();

          if (serializationPolicyFiles.containsKey(rpcServiceInterfaceName)) {
            // The same interface listed twice must point at the same policy file.
            assert serializationPolicyFiles
                .get(rpcServiceInterfaceName)
                .getName()
                .equals(rpcPolicyStrongFileName);
          } else {
            File serializationPolicyFile = new File(moduleDir, rpcPolicyStrongFileName);
            assert serializationPolicyFile.exists();
            serializationPolicyFiles.put(rpcServiceInterfaceName, serializationPolicyFile);
          }
        }
      } finally {
        // Release the manifest even when a malformed line makes the loop throw.
        LineIterator.closeQuietly(entries);
      }
    }
  }
  /**
   * Marks the requested contacts that are present in a raw contact file.
   *
   * <p>Example line from the 5kb chr1 MAPQGE30 raw observed contact matrix
   * (GM12878_combined/5kb_resolution_intrachromosomal/chr1/MAPQGE30/chr1_5kb.RAWobserved):
   * {@code 40000000 40100000 59.0}
   *
   * <p>A {@code .sorted} companion of the input is created first when it does not exist. The sorted
   * file and {@code contactsToCheck} are then walked in parallel with a single forward cursor
   * (NOTE(review): this only works if {@code contactsToCheck} is ordered the same way as the sorted
   * file; confirm at the call site). A contact whose two positions equal those of a line is marked
   * via {@code setContact()} when the line's value is at least {@code minValue}.
   *
   * @param fileToRead path of the raw contact file
   * @param minValue minimum contact value required to mark a contact
   * @param contactsToCheck contacts to look up
   * @param intra selects the intra-chromosomal sorter when {@code true}, the inter-chromosomal one
   *     otherwise
   * @throws IOException if the sorted file cannot be read
   */
  private static void processRawContactInformation(
      String fileToRead,
      double minValue,
      ArrayList<DesiredChrContact> contactsToCheck,
      boolean intra)
      throws IOException {

    final String sortedFile = fileToRead + ".sorted";

    // Produce the sorted companion on first use.
    if (!Gpio.exists(sortedFile)) {
      if (intra) {
        umcg.genetica.io.chrContacts.SortIntraChrContacts.readNonSortedWriteSorted(
            fileToRead, sortedFile);
      } else {
        umcg.genetica.io.chrContacts.SortInterChrContacts.readNonSortedWriteSorted(
            fileToRead, sortedFile);
      }
    }

    int cursor = 0;

    LineIterator lines = FileUtils.lineIterator(new File(sortedFile), "UTF-8");

    try {
      while (lines.hasNext()) {
        String[] fields = StringUtils.split(lines.nextLine(), '\t');

        int posChr1 = org.apache.commons.lang.math.NumberUtils.createInteger(fields[0]);
        int posChr2 = org.apache.commons.lang.math.NumberUtils.createInteger(fields[1]);

        while (cursor < contactsToCheck.size()) {
          DesiredChrContact wanted = contactsToCheck.get(cursor);

          // The line sorts before the wanted contact: read the next line.
          if (posChr1 < wanted.getChrLocationSmaller()) {
            break;
          }

          if (posChr1 == wanted.getChrLocationSmaller()) {
            if (posChr2 < wanted.getChrLocationLarger()) {
              break;
            }
            if (posChr2 == wanted.getChrLocationLarger()) {
              double contact = org.apache.commons.lang.math.NumberUtils.createDouble(fields[2]);
              if (contact >= minValue) {
                wanted.setContact();
              }
            }
          }

          // The wanted contact was matched or has been passed: move on to the next one.
          cursor++;
        }
      }
    } finally {
      LineIterator.closeQuietly(lines);
    }
  }
  /**
   * Helper function to create a DataMatrix from delimited text.
   *
   * <p>The first line supplies the column names, every following line one row; lines are split on
   * {@code Converter.VALUE_DELIMITER} keeping trailing empty cells. The iterator, and with it the
   * given stream, is closed before returning.
   *
   * @param data InputStream holding the delimited text
   * @return the DataMatrix, or {@code null} when the input has no header line or no data rows
   * @throws Exception if reading the stream fails
   */
  private DataMatrix getDataMatrix(InputStream data) throws Exception {

    List<String> columnNames = null;
    List<LinkedList<String>> rowData = null;
    // The cast keeps the call unambiguous on Commons IO versions that also offer a Charset
    // overload; a null encoding selects the platform default.
    LineIterator it = IOUtils.lineIterator(data, (String) null);
    try {
      while (it.hasNext()) {
        LinkedList<String> cells =
            new LinkedList<String>(
                Arrays.asList(it.nextLine().split(Converter.VALUE_DELIMITER, -1)));
        if (columnNames == null) {
          // first row is our column heading
          columnNames = cells;
        } else {
          // all other rows are rows in the table
          if (rowData == null) {
            rowData = new LinkedList<LinkedList<String>>();
          }
          rowData.add(cells);
        }
      }
    } finally {
      LineIterator.closeQuietly(it);
    }

    // problem reading from data?
    if (columnNames == null || rowData == null) {
      if (LOG.isInfoEnabled()) {
        LOG.info(
            "getDataMatrix(), problem creating DataMatrix from file, data file probably missing data, returning null");
      }
      return null;
    }

    // made it here, we can create DataMatrix
    if (LOG.isInfoEnabled()) {
      LOG.info("creating new DataMatrix(), from file data");
    }

    return new DataMatrix(rowData, columnNames);
  }
예제 #6
0
 /**
  * Returns the next line of the delegate that {@code mapper} can read as a map.
  *
  * <p>Lines that fail to parse are skipped.
  *
  * @return the parsed map
  * @throws NoSuchElementException if the delegate runs out of lines before one parses
  */
 public Map<String, Object> next() {
   while (true) {
     String line = delegate.nextLine();
     try {
       return mapper.readValue(line, Map.class);
     } catch (IOException e) {
       // unparsable line: fall through and try the following one
     }
     if (!delegate.hasNext()) {
       throw new NoSuchElementException();
     }
   }
 }
  /**
   * Demo entry point: loads the IDF table from the work directory and runs one {@code
   * SummaryBuilderByVector} task on a single-thread executor.
   *
   * <p>Each IDF line is expected to be {@code term###count}; the stored value is {@code count /
   * TOTAL_PAGE_COUNT}. Lines that do not have exactly two parts are logged and ignored.
   *
   * @param args unused
   * @throws IOException if the IDF file cannot be read
   */
  public static void main(String[] args) throws IOException {

    String workDir = "E:/dev_workspace/tmp/workspace/duc2007";
    String idfFilename = "duc2007.idf";

    final double TOTAL_PAGE_COUNT = 30000000000.0D;

    Map<String, Double> idfValues = new HashMap<String, Double>();
    File idfFile = FileUtils.getFile(workDir + "/" + DIR_IDF_FILE, idfFilename);
    log.info("Loading idf value file[" + idfFile.getAbsolutePath() + "]");
    LineIterator lineIterator = null;
    try {
      lineIterator = FileUtils.lineIterator(idfFile, DEFAULT_CHARSET.toString());
      while (lineIterator.hasNext()) {
        String line = lineIterator.nextLine();
        String[] strs = line.split("###");
        if (strs.length != 2) {
          log.warn("Line[" + line + "] format is illegal, ignore it!");
          continue;
        }
        // Trim the count as well as the term so stray whitespace does not abort the load.
        idfValues.put(strs[0].trim(), Long.parseLong(strs[1].trim()) / TOTAL_PAGE_COUNT);
      }
      log.info("Load idf value file[" + idfFile.getAbsolutePath() + "] finished!");
    } catch (IOException e) {
      log.error("Load idf value file[" + idfFile.getAbsolutePath() + "] error!", e);
      throw e;
    } finally {
      if (lineIterator != null) {
        lineIterator.close();
      }
    }

    String question =
        "Describe the legal battle between various recording artists and members of the record industry and the Internet music site Napster. What support, or lack thereof, have the litigants received?";

    EhCacheUtil ehCacheUtil = new EhCacheUtil("db_cache_vec", "lab");

    SummaryBuilderByVector summaryBuilder =
        new SummaryBuilderByVector(
            workDir, "0", "D0714D.txt", 10, idfValues, question, ehCacheUtil, 1.0f, 1.6f);
    ExecutorService es = Executors.newSingleThreadExecutor();
    try {
      Future<Boolean> future = es.submit(summaryBuilder);
      future.get();
    } catch (InterruptedException e) {
      // Restore the interrupt flag so code further up the stack can observe it.
      Thread.currentThread().interrupt();
      log.error("Interrupted while waiting for the summary builder", e);
    } catch (ExecutionException e) {
      log.error("Summary builder failed", e);
    } finally {
      // Always release the worker thread and the cache, even on unexpected failures.
      es.shutdown();
      EhCacheUtil.close();
    }
  }
  /**
   * Reads the resource's file as UTF-8 and creates one {@code DiagnosGrupp} per line, in file
   * order.
   *
   * @param resource resource whose file is read
   * @return the parsed groups
   * @throws IOException if the file cannot be opened
   */
  protected List<DiagnosGrupp> getDiagnosGrupperInternal(Resource resource) throws IOException {
    final LineIterator lines = FileUtils.lineIterator(resource.getFile(), "UTF-8");
    try {
      final List<DiagnosGrupp> grupper = new ArrayList<>();
      while (lines.hasNext()) {
        grupper.add(new DiagnosGrupp(lines.nextLine()));
      }
      return grupper;
    } finally {
      LineIterator.closeQuietly(lines);
    }
  }
예제 #9
0
 /**
  * Prints the help text stored in {@code conf/help.info} under the working directory to standard
  * output, or a short notice when that file does not exist.
  */
 private static void showHelpInfo() {
   String helpfile =
       System.getProperty("user.dir") + File.separator + "conf" + File.separator + "help.info";
   File f = new File(helpfile);
   if (!f.exists()) {
     System.out.println("help.info not exists");
     return;
   }

   LineIterator itr = null;
   try {
     itr = FileUtils.lineIterator(f, "UTF-8");
     while (itr.hasNext()) {
       System.out.println(itr.nextLine());
     }
   } catch (IOException e) {
     e.printStackTrace();
   } finally {
     // Null-safe; guarantees the file handle is released even if printing fails.
     LineIterator.closeQuietly(itr);
   }
 }
예제 #10
0
  /**
   * Creates a hash code from the source code of the warning line and the surrounding context.
   *
   * <p>The context consists of the file lines whose zero-based index lies between {@code line - 3}
   * and {@code line + 4} (inclusive), concatenated without line separators.
   *
   * @param fileName the absolute path of the file to read
   * @param line the line of the warning
   * @param encoding the encoding of the file, if <code>null</code> or empty then the default
   *     encoding of the platform is used
   * @return a hash code of the source code
   * @throws IOException if the contents of the file could not be read
   */
  public int create(final String fileName, final int line, final String encoding)
      throws IOException {
    LineIterator lineIterator = EncodingValidator.readFile(fileName, encoding);

    StringBuilder context = new StringBuilder(1000);
    try {
      for (int i = 0; lineIterator.hasNext(); i++) {
        String currentLine = lineIterator.nextLine();
        if (i >= line - 3) {
          context.append(currentLine);
        }
        if (i > line + 3) {
          break;
        }
      }
    } finally {
      // Release the file even when reading a line fails.
      LineIterator.closeQuietly(lineIterator);
    }

    return context.toString().hashCode();
  }
  /**
   * Loads an in memory cache from the given path (sets syn0 and the vocab).
   *
   * <p>Each line is expected to be a word followed by its vector components, separated by single
   * spaces. Words are added to a new vocabulary cache in file order and the vectors become the
   * rows of {@code syn0}. The file is read with the platform default charset.
   *
   * @param vectorsFile the path of the file to load
   * @return the lookup table together with the vocabulary cache it uses
   * @throws FileNotFoundException if the file does not exist or cannot be opened
   */
  public static Pair<InMemoryLookupTable, VocabCache> loadTxt(File vectorsFile)
      throws FileNotFoundException {
    BufferedReader reader = new BufferedReader(new FileReader(vectorsFile));
    VocabCache cache = new InMemoryLookupCache();
    List<INDArray> arrays = new ArrayList<>();

    LineIterator iter = IOUtils.lineIterator(reader);
    try {
      while (iter.hasNext()) {
        String line = iter.nextLine();
        String[] split = line.split(" ");
        String word = split[0];
        VocabWord word1 = new VocabWord(1.0, word);
        cache.addToken(word1);
        cache.addWordToIndex(cache.numWords(), word);
        word1.setIndex(cache.numWords());
        cache.putVocabWord(word);
        INDArray row = Nd4j.create(Nd4j.createBuffer(split.length - 1));
        for (int i = 1; i < split.length; i++) {
          row.putScalar(i - 1, Float.parseFloat(split[i]));
        }
        arrays.add(row);
      }
    } finally {
      // Release the file as soon as reading ends, including when a malformed number throws.
      LineIterator.closeQuietly(iter);
    }

    INDArray syn = Nd4j.create(new int[] {arrays.size(), arrays.get(0).columns()});
    for (int i = 0; i < syn.rows(); i++) {
      syn.putRow(i, arrays.get(i));
    }

    InMemoryLookupTable lookupTable =
        (InMemoryLookupTable)
            new InMemoryLookupTable.Builder()
                .vectorLength(arrays.get(0).columns())
                .useAdaGrad(false)
                .cache(cache)
                .build();
    Nd4j.clearNans(syn);
    lookupTable.setSyn0(syn);

    return new Pair<>(lookupTable, cache);
  }
예제 #12
0
 /**
  * Provides the test data read from the classpath resource {@code testdata/fibonacci.txt}.
  *
  * <p>Lines starting with {@code #} are skipped; every other line is split on {@code ;} and its
  * first two columns are stored as key and value.
  *
  * @return the key/value pairs of the resource
  * @throws IllegalStateException if the resource cannot be read
  */
 @TestData
 @Provides
 Map<String, String> provideTestData() {
   ClassLoader loader = Thread.currentThread().getContextClassLoader();
   try (InputStream in = loader.getResourceAsStream("testdata/fibonacci.txt")) {
     Map<String, String> data = newHashMapWithExpectedSize(20);
     LineIterator rows = lineIterator(in, "UTF-8");
     while (rows.hasNext()) {
       String row = rows.nextLine();
       if (!row.startsWith("#")) {
         String[] columns = row.split(";");
         data.put(columns[0], columns[1]);
       }
     }
     return data;
   } catch (IOException ex) {
     throw new IllegalStateException("Error reading test data.", ex);
   }
 }
예제 #13
0
파일: FileUtility.java 프로젝트: ooz/EDLed
  /**
   * Reads all lines of a UTF-8 text file.
   *
   * <p>A missing file yields an empty list. An I/O error is logged as a warning and the lines read
   * up to that point are returned.
   *
   * @param file the file to read
   * @return the lines of the file, in order
   */
  public static List<String> lines(final File file) {
    final List<String> result = new LinkedList<String>();
    if (!file.exists()) {
      return result;
    }

    LineIterator iterator = null;
    try {
      iterator = FileUtils.lineIterator(file, "UTF-8");
      while (iterator.hasNext()) {
        result.add(iterator.nextLine());
      }
    } catch (IOException e) {
      logger.warn("I/O error with file " + file.getPath(), e);
    } finally {
      LineIterator.closeQuietly(iterator);
    }
    return result;
  }
예제 #14
0
  /**
   * Builds an update package from a path-info listing.
   *
   * <p>Each listing line is either a bare file name or {@code name----------crc}. Every name is
   * first registered in {@code updateinfo} with status {@code -1}; {@code listDir} is then run
   * over {@code home} (NOTE(review): presumably it updates those statuses; confirm in {@code
   * listDir}). Afterwards entries still at {@code -1} are written as {@code delete} lines to
   * {@code del.bat}, and entries with status {@code 1} are copied into {@code Update.zip} under
   * {@code home}. I/O errors are printed to standard error.
   *
   * @param fPathInfo the listing file to read
   */
  public void update(File fPathInfo) {
    final String separator = "----------";
    try {
      updateinfo = new Hashtable<String, Long[]>();
      LineIterator lineIterator = FileUtils.lineIterator(fPathInfo);
      try {
        while (lineIterator.hasNext()) {
          String line = lineIterator.nextLine();
          String name = line;
          long crc = 0;
          int separatorAt = line.indexOf(separator);
          if (separatorAt > 0) {
            name = line.substring(0, separatorAt);
            crc = Long.parseLong(line.substring(separatorAt + separator.length()));
          }
          updateinfo.put(name, new Long[] {-1l, crc});
        }
      } finally {
        // The listing is fully consumed (or failed); release it before the long-running work.
        LineIterator.closeQuietly(lineIterator);
      }
      System.out.println("updateinfo " + updateinfo.size());

      listDir(new File(home), 2);

      System.out.println("updateinfo " + updateinfo.size());

      File foUpdate = new File(home + "\\Update.zip");
      if (foUpdate.exists()) {
        foUpdate.delete();
      }
      StringBuffer sb = new StringBuffer();
      ZipOutputStream out = new ZipOutputStream(new FileOutputStream(home + "\\Update.zip"));
      try {
        for (String fname : updateinfo.keySet()) {
          Long[] status = updateinfo.get(fname);
          if (status[0] == -1) {
            sb.append("delete " + fname + "\n");
          } else if (status[0] == 1) {
            out.putNextEntry(new ZipEntry(fname));
            FileInputStream in = new FileInputStream(fname);
            try {
              IOUtils.copy(in, out);
            } finally {
              // One handle per packaged file; close each as soon as it has been copied.
              IOUtils.closeQuietly(in);
            }
          }
        }
      } finally {
        IOUtils.closeQuietly(out);
      }
      FileUtils.writeStringToFile(new File("del.bat"), sb.toString());
    } catch (IOException e) {
      e.printStackTrace(System.err);
    }
  }
  /**
   * Loads a lookup cache from an input stream holding one word per line.
   *
   * <p>Empty lines are skipped. Every other line is counted, added as a token and indexed in
   * order of appearance, starting at 0. The stream is decoded with the platform default charset
   * and is not closed by this method.
   *
   * @param from the input stream to read from
   * @return the in memory lookup cache
   */
  public static InMemoryLookupCache load(InputStream from) {
    InMemoryLookupCache cache = new InMemoryLookupCache();
    LineIterator lines = IOUtils.lineIterator(new InputStreamReader(from));
    int index = 0;
    while (lines.hasNext()) {
      String token = lines.nextLine();
      if (!token.isEmpty()) {
        cache.incrementWordCount(token);
        VocabWord word = new VocabWord(1.0, token);
        word.setIndex(index);
        cache.addToken(word);
        cache.addWordToIndex(index, token);
        cache.putVocabWord(token);
        index++;
      }
    }
    return cache;
  }
  /**
   * Reads headers from the batch starting from the given position.
   *
   * <p>The batch part is first copied into memory, then scanned line by line as UTF-8; every
   * non-blank, trimmed line is passed to {@code addHeaderLine} together with {@code target}.
   *
   * @param iterator batch iterator.
   * @param target destination of the retrieved headers.
   * @throws IllegalStateException wrapping any failure while reading or parsing
   */
  public static void readHeaders(
      final ODataBatchLineIterator iterator, final Map<String, Collection<String>> target) {

    try {
      final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
      readBatchPart(new ODataBatchController(iterator, null), buffer, true);

      final byte[] raw = buffer.toByteArray();
      final LineIterator lines = IOUtils.lineIterator(new ByteArrayInputStream(raw), Constants.UTF8);
      while (lines.hasNext()) {
        final String header = lines.nextLine().trim();
        if (!StringUtils.isBlank(header)) {
          addHeaderLine(header, target);
        }
      }
    } catch (Exception e) {
      LOG.error("Error retrieving headers", e);
      throw new IllegalStateException(e);
    }
  }
예제 #17
0
  /**
   * Parses a UTF-8 routes file.
   *
   * <p>A blank line ends the current route. A line starting with two spaces is handed to {@code
   * loadModelEntryForRoute} for the current route. Any other line is parsed as a new route by
   * {@code parseLine} and added to the result when that returns non-null.
   *
   * @param file the routes file
   * @return the parsed routes, in file order
   * @throws IOException if the file cannot be read
   */
  @Override
  public List<Route> parseRoutes(File file) throws IOException {
    LineIterator it = FileUtils.lineIterator(file, "UTF-8");
    List<Route> routes = new LinkedList<Route>();

    try {
      Route currentRoute = null;
      while (it.hasNext()) {
        String line = it.nextLine();
        if (line.trim().isEmpty()) {
          currentRoute = null;
        } else if (line.startsWith("  ")) {
          loadModelEntryForRoute(currentRoute, line);
        } else {
          currentRoute = parseLine(line.trim());
          if (currentRoute != null) {
            routes.add(currentRoute);
          }
        }
      }
    } finally {
      // The iterator holds an open file handle that was previously never released.
      LineIterator.closeQuietly(it);
    }
    return routes;
  }
예제 #18
0
  /**
   * Loads the lexicon from the gzip-compressed file {@code data/lexicon.txt.gz}.
   *
   * <p>Each line is split on single whitespace characters; the first field is the word and every
   * following field a POS tag. Known tags are stored in {@code lexiconMap} under the word; unknown
   * tags are logged as warnings. The text is decoded with the platform default charset.
   *
   * @throws IOException if the file cannot be opened or is not valid gzip data
   */
  public void load() throws IOException {
    log.info("Loading lexicon...");
    File dataFile = new File("data/lexicon.txt.gz");
    FileInputStream fileStream = new FileInputStream(dataFile);
    LineIterator iterator = null;
    try {
      Reader reader =
          new BufferedReader(new InputStreamReader(new GZIPInputStream(fileStream)));
      iterator = IOUtils.lineIterator(reader);

      while (iterator.hasNext()) {
        String line = iterator.nextLine();
        String[] splits = line.split("\\s");
        for (int x = 1; x < splits.length; ++x) {
          POSTag tag = POSTag.fromString(splits[x]);
          if (tag == null) {
            log.warn("Unknown tag: {0}", splits[x]);
          } else {
            lexiconMap.put(splits[0], tag);
          }
        }
      }
    } finally {
      // Closing the iterator closes the whole reader chain; the raw stream is closed as well in
      // case the gzip header was invalid and the chain was never built.
      LineIterator.closeQuietly(iterator);
      IOUtils.closeQuietly(fileStream);
    }
    log.info("Lexicon loaded!");
  }
  /**
   * Creates the normalizer and, when a path is given, loads the redirect table from it.
   *
   * <p>The redirect data is Snappy-compressed UTF-8 text with one tab-separated {@code source,
   * target} pair per line; spaces in both titles are replaced by underscores. With the bloom
   * filter enabled only the sources are recorded in the filter; otherwise the pairs are stored in
   * the redirect map. The map is made unmodifiable at the end.
   *
   * @param pathToEvaluationRedirectsData path of the redirect data, or {@code null} to skip
   *     loading
   * @throws IOException if the redirect data cannot be read
   */
  private TitleNameNormalizer(String pathToEvaluationRedirectsData) throws IOException {

    if (useBloomFilter) {
      redirectFilter = BloomFilter.create(Funnels.stringFunnel(), ESTIMATED_REDIRECTS);
      redirects =
          new LRUCache<String, String>(5000) {
            protected String loadValue(String src) {
              // Normalize once; fall back to the raw title when nothing is found.
              String normalized = TitleNameIndexer.normalize(src);
              return normalized == null ? src : normalized;
            }
          };
    } else {
      redirects = new StringMap<String>();
    }
    if (showInitProgress) {
      System.out.println(
          "Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    }
    if (pathToEvaluationRedirectsData != null) {
      InputStream is = CompressionUtils.readSnappyCompressed(pathToEvaluationRedirectsData);
      LineIterator iterator = IOUtils.lineIterator(is, StandardCharsets.UTF_8);

      try {
        long linecount = 0;
        while (iterator.hasNext()) {
          String line = iterator.nextLine();
          if (showInitProgress && linecount++ % 100000 == 0) {
            System.out.println("loading the latest redirects; linecount=" + linecount);
          }
          String[] parts = StringUtils.split(line, '\t');

          String src = parts[0].trim().replace(' ', '_');
          String trg = parts[1].trim().replace(' ', '_');
          if (useBloomFilter) {
            redirectFilter.put(src);
          } else {
            redirects.put(src, trg);
          }
        }
      } finally {
        // Release the compressed stream even when a malformed line aborts loading.
        LineIterator.closeQuietly(iterator);
      }
    }
    redirects = Collections.unmodifiableMap(redirects);
    if (showInitProgress) {
      System.out.println(
          "Done  - Loading the most recent redirect pages from Wikipedia to normalize the output links to the latest version");
    }
  }
  /**
   * Reads the precomputed md5 digest out of a .md5 file (firehose). The file is assumed to
   * contain a single line with the checksum.
   *
   * <p>Every line that splits on single spaces into exactly two parts supplies a candidate (its
   * first part, upper-cased); the last such line wins.
   *
   * @param file the .md5 file
   * @return the upper-cased digest, or an empty string when no line matches
   * @throws Exception if the file cannot be read
   */
  @Override
  public String getPrecomputedMD5Digest(File file) throws Exception {

    if (LOG.isInfoEnabled()) {
      LOG.info("getPrecomputedMD5Digest(): " + file.getCanonicalPath());
    }

    String digest = "";
    LineIterator lines = org.apache.commons.io.FileUtils.lineIterator(file);
    try {
      while (lines.hasNext()) {
        String[] tokens = lines.nextLine().split(" ");
        if (tokens.length == 2) {
          digest = tokens[0].toUpperCase();
        }
      }
    } finally {
      LineIterator.closeQuietly(lines);
    }

    return digest;
  }
 /**
  * Parses dates from delimited text.
  *
  * <p>Each line is split on {@code separator} (a regular expression) and every field except the
  * last is parsed with the {@code FORMATO_DATA} pattern in GMT. The first field of a line that
  * fails to parse ends processing of that line. The reader is closed before returning.
  *
  * @param reader source of the text
  * @param separator regular expression separating the fields
  * @return the parsed dates, in reading order
  */
 @Override
 public List<Date> read(java.io.Reader reader, String separator) throws Exception {
   final List<Date> parsed = new ArrayList<Date>();
   try {
     final SimpleDateFormat format = new SimpleDateFormat(FORMATO_DATA);
     format.setTimeZone(TimeZone.getTimeZone("GMT"));
     final LineIterator lines = IOUtils.lineIterator(reader);
     while (lines.hasNext()) {
       final String[] fields = lines.nextLine().split(separator);
       // the last field of every line is deliberately left out
       for (int i = 0; i < fields.length - 1; i++) {
         try {
           parsed.add(format.parse(fields[i]));
         } catch (ParseException e) {
           break;
         }
       }
     }
   } finally {
     IOUtils.closeQuietly(reader);
   }
   return parsed;
 }
예제 #22
0
 /**
  * Appends the next input line, followed by a newline character, to the output.
  *
  * @param output buffer receiving the line
  * @param lineIterator source of the line
  */
 private void copyLine(final StringBuilder output, final LineIterator lineIterator) {
   final String nextLine = lineIterator.nextLine();
   output.append(nextLine).append("\n");
 }
  /**
   * Loads the compressed sentences, organized by cluster.
   *
   * <p>A line matching {@code classes_<n>:} starts a new cluster. Every other line is a {@code
   * score#sentence} entry of the current cluster: it always increases the cluster size, and it is
   * kept as a (score, sentence) pair until the per-cluster limit is reached.
   *
   * @param count number of sentences selected per cluster. NOTE(review): the check is {@code
   *     sentCount > count}, so up to {@code count + 1} sentences are kept; confirm whether that is
   *     intended.
   * @return the clusters keyed by name
   * @throws IOException if the file cannot be read
   */
  private Map<String, ClustItem> loadSentences(int count) throws IOException {

    Map<String, ClustItem> clustedSentences = new HashMap<String, ClustItem>();

    Pattern pattern = Pattern.compile("(classes_\\d+):");

    LineIterator lineIterator = null;
    try {
      log.info(
          "Loading msc file["
              + this.workDir
              + "/"
              + GlobalConstant.DIR_SENTENCES_COMPRESSION
              + "/"
              + this.filename
              + "]");
      lineIterator =
          FileUtils.lineIterator(
              FileUtils.getFile(
                  this.workDir + '/' + GlobalConstant.DIR_SENTENCES_COMPRESSION, this.filename),
              GlobalConstant.DEFAULT_CHARSET.toString());

      String currentKey = "";
      int sentCount = 0; // sentences selected for the current cluster
      int totalCount = 0; // sentences selected over all finished clusters
      while (lineIterator.hasNext()) {
        String line = lineIterator.nextLine();
        Matcher matcher = pattern.matcher(line);
        if (matcher.find()) {
          // cluster header line
          currentKey = matcher.group(1);
          ClustItem clustItem = new ClustItem();
          clustItem.setName(currentKey);
          clustedSentences.put(currentKey, clustItem);
          totalCount += sentCount;
          sentCount = 0;
        } else {
          ClustItem ci = clustedSentences.get(currentKey);
          ci.setSize(ci.getSize() + 1);
          if (sentCount > count) {
            continue;
          }
          List<Pair<Float, String>> sentences = ci.getSentences();
          if (null == sentences) {
            sentences = new ArrayList<Pair<Float, String>>();
            ci.setSentences(sentences);
          }
          // convert score#sentence into (score, sentence)
          int flagNum = line.indexOf("#");
          sentences.add(
              new Pair<Float, String>(
                  Float.parseFloat(line.substring(0, flagNum)), line.substring(flagNum + 1)));
          ++sentCount;
        }
      }
      // The last cluster has no following header, so add its sentences to the total here.
      totalCount += sentCount;

      log.info("Load msc file finished[sentence count:" + totalCount + "]");

    } catch (IOException e) {
      log.error(
          "Load msc file["
              + this.workDir
              + "/"
              + GlobalConstant.DIR_SENTENCES_COMPRESSION
              + "/"
              + this.filename
              + "] error!",
          e);
      throw e;
    } finally {
      // Null-safe; the iterator was previously never closed.
      LineIterator.closeQuietly(lineIterator);
    }

    return clustedSentences;
  }
  /**
   * Parses the user item rating data and stores them into collections.
   *
   * <p>The file is a sequence of {@code key: value} lines. The product id, product title, user
   * id, profile name and score lines update the current record; a {@code review/text:} line
   * completes it. A completed record with a score above 3.0 is added to the relevant item/user
   * matrices, any other record to the non-relevant item matrix. Every record is also added to
   * {@code userItemMatrix}, {@code itemDetails} and {@code userDetails}. User ids equal to
   * {@code unknown} get the running record count appended. The file is read with the platform
   * default charset.
   *
   * @param filePath path of the rating file; nothing is parsed when it is not a regular file
   */
  @SuppressWarnings("unused")
  private void parseDataIntoItemUserMatrix(String filePath) {
    File file = new File(filePath);

    int i = 0;
    if (file.isFile()) {
      try {
        String productId = null;
        String productTitle = null;
        String profileName = null;
        String profileId = null;
        Double score = null;

        LineIterator content = FileUtils.lineIterator(file);
        try {
          while (content.hasNext()) {
            String line = content.nextLine();
            if (line.startsWith("product/productId:")) {
              productId = valueAfterPrefix(line, "product/productId:");
            } else if (line.startsWith("product/title:")) {
              productTitle = valueAfterPrefix(line, "product/title:");
            } else if (line.startsWith("review/userId:")) {
              profileId = valueAfterPrefix(line, "review/userId:");
              if (profileId.equals("unknown")) {
                profileId = profileId.concat("" + i).trim();
              }
            } else if (line.startsWith("review/profileName:")) {
              profileName = valueAfterPrefix(line, "review/profileName:");
            } else if (line.startsWith("review/score:")) {
              score = Double.valueOf(valueAfterPrefix(line, "review/score:"));
            } else if (line.startsWith("review/text:")) {
              if (score != null && score > 3.0) {
                addIfAbsent(itemUserMatrixRelevant, productId, profileId);
                addIfAbsent(userItemMatrixRelevant, profileId, productId);
              } else {
                addIfAbsent(itemUserMatrixNonRelevant, productId, profileId);
              }
              addIfAbsent(userItemMatrix, profileId, productId);
              i++;
              if (productId.equals("B00006690A")) {
                System.out.println(productId);
              }
              itemDetails.put(productId, productTitle);
              userDetails.put(profileId, profileName);
            }
          }
        } finally {
          // The iterator holds an open file handle that was previously never released.
          LineIterator.closeQuietly(content);
        }
        removeNoise();
        if (itemDetails.containsKey("B00006690A")) {
          System.out.println(productId);
        }
      } catch (IOException e) {
        e.printStackTrace();
      }
    }
    System.out.println(
        "total size of the itemUserMatrixRelevant is " + itemUserMatrixRelevant.size());
    System.out.println(
        "total size of the itemUserMatrixNonRelevant is " + itemUserMatrixNonRelevant.size());
    System.out.println("total size of the userItemMatrix is " + userItemMatrix.size());
    System.out.println("total size of ratings for all items " + i);
  }

  /** Returns the trimmed text following {@code prefix}; empty when nothing follows it. */
  private static String valueAfterPrefix(String line, String prefix) {
    return line.substring(prefix.length()).trim();
  }

  /** Adds {@code value} to the list stored under {@code key}, creating the list when missing. */
  private static void addIfAbsent(
      java.util.Map<String, ArrayList<String>> matrix, String key, String value) {
    ArrayList<String> values = matrix.get(key);
    if (values == null) {
      values = new ArrayList<String>();
      matrix.put(key, values);
    }
    if (!values.contains(value)) {
      values.add(value);
    }
  }
예제 #25
0
  /**
   * Turns the pipe-delimited leads file into INSERT statements, writes those statements to the
   * configured output file and hands them to {@code jdbcExecute.execute}.
   *
   * <p>Only files whose third {@code "_"}-separated name token is {@code "01"} are processed; any
   * other value is logged and reported as success without touching the file or the database.
   *
   * <p>The column list comes from the consecutive properties {@code leads.insert.column.1},
   * {@code leads.insert.column.2}, ... each of the form {@code name|lctype|expstr}. For every input
   * line one {@code String[2]} is produced: element 0 is the INSERT statement, element 1 is the
   * {@code leads.insert.check.sql} template expanded with the line's columns.
   *
   * @return {@code 0} on success or skip, {@code 805} if a {@link FileNotFoundException} occurred,
   *     {@code 815} on a {@link SQLIntegrityConstraintViolationException}, {@code 809} on any other
   *     exception
   */
  public int run() {
    String step = leadsfile.split("_")[2];
    if (!"01".equals(step)) {
      logger.info(
          "Leads setp :" + step + " skip leadsInsert",
          leadsfile,
          "",
          "2-1",
          "",
          "名单营销波次为:" + step + ",跳过LeadsInsert.");
      return 0;
    }

    int rt = -1;
    LineIterator it = null;
    JDBCExecute jdbcExecute =
        new JDBCExecute(
            conf.getProperty("leads.insert.db.driverClass"),
            conf.getProperty("leads.insert.db.databaseURL"),
            new KeyMapResource().get(conf.getProperty("leads.insert.db.databaseUser")),
            new KeyMapResource().get(conf.getProperty("leads.insert.db.databasePassword")));

    String getCampSQL = conf.getProperty("leads.insert.get.camp.sql");

    List<String[]> sqls = new ArrayList<String[]>();
    List<String> file_sqls = new ArrayList<String>();

    try {
      File readfile = new File(conf.getProperty("leads.insert.readpath") + leadsfile);
      File outfile = new File(conf.getProperty("leads.insert.savepath") + insertfile);
      String tableName = conf.getProperty("leads.insert.db.table.name");

      logger.info(
          "Leads Insert Table Name:" + tableName,
          leadsfile,
          "",
          "2-1",
          "",
          "名单要插入的表为:" + tableName);

      // Build the fixed "INSERT INTO <table>(c1,c2,...) " prefix and collect the column
      // descriptors; the column list ends at the first missing leads.insert.column.<n> property.
      StringBuilder sqlColumnBuffer = new StringBuilder("INSERT INTO");
      sqlColumnBuffer.append(" ").append(tableName).append("(");
      List<InsertColumn> insertColumns = new ArrayList<InsertColumn>();
      String columnSpec;
      for (int i = 1; (columnSpec = conf.getProperty("leads.insert.column." + i)) != null; i++) {
        String[] specParts = columnSpec.split("\\|", -1);
        if (i != 1) {
          sqlColumnBuffer.append(",");
        }

        InsertColumn ic = new InsertColumn();
        ic.setName(specParts[0]);
        ic.setLctype(specParts[1]);
        ic.setExpstr(specParts[2]);
        insertColumns.add(ic);

        sqlColumnBuffer.append(specParts[0]);
      }
      sqlColumnBuffer.append(") ");
      // The prefix never changes once the columns are known, so render it once.
      String insertPrefix = sqlColumnBuffer.toString();

      // The VALUES buffer is reused for every row and truncated back to this prefix afterwards.
      final String valuesPrefix = "VALUES (";
      StringBuilder sqlValuesBuffer = new StringBuilder(valuesPrefix);

      it = FileUtils.lineIterator(readfile, conf.getProperty("leads.insert.file.encoding"));
      String selectSQL = conf.getProperty("leads.insert.check.sql");
      String[] oneObjects = null;
      boolean isOneQuery = false;
      while (it.hasNext()) {
        String[] file_columns = it.nextLine().split("\\|", -1);

        if (!isOneQuery) {
          // Executed once, using the first line of the file to parameterise the lookup query.
          String tempsql = SqlStrUtils.toSQL(getCampSQL, file_columns);
          String campSqlMessage = "LeadsInsert.getCampSQL:" + tempsql.replace("'", "");
          logger.info(campSqlMessage, leadsfile, "", "2-1", "", campSqlMessage);
          oneObjects = jdbcExecute.getOneObject(tempsql);
          isOneQuery = true;
        }

        for (int j = 0; j < insertColumns.size(); j++) {
          if (j != 0) {
            sqlValuesBuffer.append(",");
          }
          String value = toValue(oneObjects, file_columns, insertColumns.get(j));
          if (value != null && !value.equals("")) {
            // NOTE(review): the value is spliced into the statement as a quoted literal without
            // escaping here; confirm toValue() escapes quotes or switch to bound parameters.
            sqlValuesBuffer.append("'").append(value).append("'");
          } else {
            sqlValuesBuffer.append("NULL");
          }
        }
        sqlValuesBuffer.append(")");

        String newSelectSQL = toExpstrValue(selectSQL, file_columns);

        String[] sql = new String[2];
        sql[0] = insertPrefix + sqlValuesBuffer;
        sql[1] = newSelectSQL;
        file_sqls.add(sql[0]);
        sqls.add(sql);

        // Reset the buffer to just "VALUES (" for the next row.
        sqlValuesBuffer.setLength(valuesPrefix.length());
      }

      FileUtils.writeLines(outfile, conf.getProperty("leads.insert.file.encoding"), file_sqls);
      jdbcExecute.execute(sqls);
      logger.info(
          "Leads Insert Table[" + tableName + "] success. row count:" + sqls.size(),
          leadsfile,
          "",
          "2-1",
          "",
          "名单要插入的表[" + tableName + "]完成!插入笔数:" + sqls.size());
      logger.info(
          "Leads Insert run success!gen .SQL file:" + insertfile + " ",
          leadsfile,
          "",
          "2-1",
          "LINST00001",
          "LeadsInsert运行成功!生成 " + insertfile + " 完成!");
      rt = 0;
    } catch (Exception e) {
      // getMessage() may legitimately be null (e.g. a bare NullPointerException); fall back to
      // toString() so the error handler itself cannot fail and lose the return code.
      String rawMessage = e.getMessage();
      String message = (rawMessage != null ? rawMessage : e.toString()).replace("'", "");
      if (e instanceof FileNotFoundException) {
        rt = 805;
        logger.error(message, leadsfile, "", "2-1", "805", "没有找到指定文件!" + message);
      } else if (e instanceof SQLIntegrityConstraintViolationException) {
        rt = 815;
        logger.error(message, leadsfile, "", "2-1", "815", "插入数据库发生异常!" + message);
      } else {
        rt = 809;
        // Kept: none of the logger calls in this method pass the throwable itself, so this is the
        // only place the stack trace is emitted.
        e.printStackTrace();
        logger.error(
            "leadsInsert error:" + message, leadsfile, "", "2-1", "809", "程序发生异常!" + message);
      }
    } finally {
      LineIterator.closeQuietly(it);
      jdbcExecute.closeConnection();
    }

    return rt;
  }
  /**
   * Walks a sorted intra-chromosomal contact file in step with {@code contactsToCheck} and writes
   * one tab-separated line to {@code outWriter} for each requested contact that the file passes.
   *
   * <p>If {@code fileToRead + ".sorted"} does not exist it is created first. Both the sorted file
   * and {@code contactsToCheck} are consumed front to back with a single cursor, so the list is
   * presumably expected in the same order as the sorted file (verify against the caller).
   *
   * <p>For a requested contact whose two positions both equal a file line, the raw value in the
   * third column is divided by the product of the two normalisation factors; if the result is at
   * least {@code minValue} the line ends in {@code Contact}, the normalised value and the raw
   * value, otherwise in {@code -\t-\t-}. A requested contact that the file has already passed is
   * written with {@code -\t-\t-}. Requested contacts still pending when the file ends produce no
   * output.
   *
   * @param fileToRead path of the unsorted contact file; the {@code .sorted} sibling is read
   * @param baseName directory holding the normalisation-factor files
   * @param normMethod file extension of the normalisation-factor file
   * @param chrSmaller chromosome name used in the normalisation-factor file name
   * @param contactsToCheck requested contacts
   * @param resolution resolution label, used in the file name and converted by
   *     {@code getNumericResolution}
   * @param minValue minimum normalised contact value reported as a contact
   * @param outWriter destination for the result lines
   * @throws IOException if reading an input file or writing the output fails
   */
  private static void processNormalizedIntraContactInformation(
      String fileToRead,
      String baseName,
      String normMethod,
      String chrSmaller,
      ArrayList<DesiredChrContact> contactsToCheck,
      String resolution,
      double minValue,
      TextFile outWriter)
      throws IOException {

    // Read the normalisation factors; close the file even when reading fails.
    // NOTE(review): the "\\chr" separator is Windows-specific - confirm this never runs elsewhere.
    TextFile inputNormChr1 =
        new TextFile(
            baseName + "\\chr" + chrSmaller + "_" + resolution + "." + normMethod, TextFile.R);
    ArrayList<String> normFactorSmallerChr;
    try {
      normFactorSmallerChr = inputNormChr1.readAsArrayList();
    } finally {
      inputNormChr1.close();
    }

    if (!Gpio.exists(fileToRead + ".sorted")) {
      umcg.genetica.io.chrContacts.SortIntraChrContacts.readNonSortedWriteSorted(
          fileToRead, fileToRead + ".sorted");
    }

    // Cursor into contactsToCheck: index of the next requested contact still to be resolved.
    int numberToBeMatched = 0;

    LineIterator it = FileUtils.lineIterator(new File(fileToRead + ".sorted"), "UTF-8");

    try {
      // Once every requested contact is resolved no further line can produce output, so stop
      // reading instead of parsing the rest of the file.
      while (it.hasNext() && numberToBeMatched < contactsToCheck.size()) {
        String[] parts = StringUtils.split(it.nextLine(), '\t');

        int posChr1 = org.apache.commons.lang.math.NumberUtils.createInteger(parts[0]);
        int posChr2 = org.apache.commons.lang.math.NumberUtils.createInteger(parts[1]);

        while (numberToBeMatched < contactsToCheck.size()) {
          DesiredChrContact target = contactsToCheck.get(numberToBeMatched);

          // The requested contact lies beyond this file line: move on to the next line.
          if (posChr1 < target.getChrLocationSmaller()) {
            break;
          }
          if (posChr1 == target.getChrLocationSmaller()
              && posChr2 < target.getChrLocationLarger()) {
            break;
          }

          String linePrefix =
              target.getSnpName()
                  + "\t"
                  + target.getProbeName()
                  + "\t"
                  + posChr1
                  + "\t"
                  + posChr2;

          boolean exactMatch =
              posChr1 == target.getChrLocationSmaller()
                  && posChr2 == target.getChrLocationLarger();

          if (exactMatch) {
            int binSize = getNumericResolution(resolution);
            // "+ 1" presumably skips a header line in the factor file - TODO confirm.
            String factor1Base = normFactorSmallerChr.get((posChr1 / binSize) + 1);
            String factor2Base = normFactorSmallerChr.get((posChr2 / binSize) + 1);

            // NOTE(review): if this is commons-lang StringUtils, isNumeric() accepts digits only,
            // so factors such as "1.25" or "NaN" end up in the error branch - confirm intended.
            if (StringUtils.isNumeric(factor1Base) && StringUtils.isNumeric(factor2Base)) {
              double factor1 = org.apache.commons.lang.math.NumberUtils.createDouble(factor1Base);
              double factor2 = org.apache.commons.lang.math.NumberUtils.createDouble(factor2Base);
              double rawContact = org.apache.commons.lang.math.NumberUtils.createDouble(parts[2]);

              double contact = rawContact / (factor1 * factor2);
              if (contact >= minValue) {
                outWriter.writeln(linePrefix + "\tContact\t" + contact + "\t" + rawContact);
              } else {
                outWriter.writeln(linePrefix + "\t-\t-\t-");
              }
            } else {
              System.out.println("Error in files.");
            }
          } else {
            // The file has already passed the requested position: report it as absent.
            outWriter.writeln(linePrefix + "\t-\t-\t-");
          }
          numberToBeMatched++;
        }
      }
    } finally {
      LineIterator.closeQuietly(it);
    }
  }
  /**
   * Runs a MAF file through the Oncotator and OMA (Mutation Assessor) tools.
   *
   * <p>The fourth tab-separated column of the first data row is inspected; if it contains
   * {@code "36"} or equals {@code "hg18"} the file is first passed through
   * {@code Hg18ToHg19.driver}. In that case the input is copied to a temp file and the path of
   * the original input is passed as the second driver argument, and that path is deleted again
   * during clean-up.
   *
   * @param inputMAFURL URL of the MAF file to annotate; must be non-empty
   * @param outputMAFURL URL of the file that receives the annotated MAF; must be non-empty
   * @throws IllegalArgumentException if either argument is {@code null} or empty
   * @throws Exception if a URL is malformed, the input cannot be read (including an input with
   *     fewer than two lines), or one of the tools fails
   */
  @Override
  public void oncotateMAF(String inputMAFURL, String outputMAFURL) throws Exception {

    // sanity check
    if (inputMAFURL == null
        || inputMAFURL.length() == 0
        || outputMAFURL == null
        || outputMAFURL.length() == 0) {
      throw new IllegalArgumentException(
          "oncotateMAF(): inputMAFURL or outputMAFURL argument is null or empty...");
    }

    URL inputMAF = new URL(inputMAFURL);
    URL outputMAF = new URL(outputMAFURL);

    File oncotatorInputFile = new File(inputMAF.getFile());

    // Determine if we have to call liftover by peeking at the first data row. The iterator is
    // closed in all cases, so the file handle is released before any tool touches the file.
    boolean needsLiftover;
    org.apache.commons.io.LineIterator it =
        org.apache.commons.io.FileUtils.lineIterator(oncotatorInputFile);
    try {
      it.nextLine(); // skip header
      String[] parts = it.nextLine().split("\t");
      needsLiftover = parts[3].contains("36") || parts[3].equals("hg18");
    } finally {
      it.close();
    }

    boolean cleanOncotatorInputFile = false;
    if (needsLiftover) {
      File liftoverInputFile =
          org.apache.commons.io.FileUtils.getFile(
              org.apache.commons.io.FileUtils.getTempDirectory(), "liftoverInputFile");
      org.apache.commons.io.FileUtils.copyFile(oncotatorInputFile, liftoverInputFile);
      // call lift over
      // NOTE(review): the second argument is the original input path, which is also deleted
      // during clean-up below - confirm that replacing and removing the caller's file is intended.
      if (LOG.isInfoEnabled()) {
        LOG.info("oncotateMAF(), calling Hg18ToHg19...");
      }
      Hg18ToHg19.driver(
          liftoverInputFile.getCanonicalPath(),
          oncotatorInputFile.getCanonicalPath(),
          getLiftOverBinary(),
          getLiftOverChain());
      org.apache.commons.io.FileUtils.forceDelete(liftoverInputFile);
      cleanOncotatorInputFile = true;
    }

    // create a temp output file from the oncotator
    File oncotatorOutputFile =
        org.apache.commons.io.FileUtils.getFile(
            org.apache.commons.io.FileUtils.getTempDirectory(), "oncotatorOutputFile");
    // call oncotator
    if (LOG.isInfoEnabled()) {
      LOG.info("oncotateMAF(), calling OncotateTool...");
    }
    OncotateTool.driver(
        oncotatorInputFile.getCanonicalPath(),
        oncotatorOutputFile.getCanonicalPath(),
        true,
        true,
        true);
    // we call OMA here -
    // we use output from oncotator as input file
    if (LOG.isInfoEnabled()) {
      LOG.info("oncotateMAF(), calling MutationAssessorTool...");
    }
    File outputMAFFile = new File(outputMAF.getFile());
    outputMAFFile.createNewFile();
    MutationAssessorTool.driver(
        oncotatorOutputFile.getCanonicalPath(),
        outputMAFFile.getCanonicalPath(),
        false,
        true,
        true);

    // clean up
    org.apache.commons.io.FileUtils.forceDelete(oncotatorOutputFile);
    if (cleanOncotatorInputFile) {
      org.apache.commons.io.FileUtils.forceDelete(oncotatorInputFile);
    }
  }
  /**
   * Collects the distinct tumor case IDs found in a staging file.
   *
   * <p>The first line is treated as the header. When the header contains
   * {@code Converter.MUTATION_CASE_ID_COLUMN_HEADER}, the value in that column of every following
   * row is examined. Otherwise the header cells themselves are examined and no further rows are
   * read. Every value accepted by {@code caseIDs.isTumorCaseID} is converted with
   * {@code caseIDs.convertCaseID} before it is collected.
   *
   * @param caseIDs CaseIDs used to recognise and convert case IDs
   * @param portalMetadata PortalMetadata supplying the staging directory
   * @param cancerStudyMetadata CancerStudyMetadata supplying the study path
   * @param stagingFilename name of the staging file inside the study path
   * @return the collected case IDs without duplicates, in no particular order; an empty list when
   *     the staging file does not exist
   * @throws Exception if the staging file cannot be read
   */
  @Override
  public List<String> getCaseListFromStagingFile(
      CaseIDs caseIDs,
      PortalMetadata portalMetadata,
      CancerStudyMetadata cancerStudyMetadata,
      String stagingFilename)
      throws Exception {

    if (LOG.isInfoEnabled()) {
      LOG.info("getCaseListFromStagingFile(): " + stagingFilename);
    }

    // locate the staging file; nothing to report when it is absent
    File stagingFile =
        org.apache.commons.io.FileUtils.getFile(
            portalMetadata.getStagingDirectory(),
            cancerStudyMetadata.getStudyPath(),
            stagingFilename);
    if (!stagingFile.exists()) {
      return new ArrayList<String>();
    }

    // a set, so that repeated case IDs are only reported once
    HashSet<String> collected = new HashSet<String>();

    org.apache.commons.io.LineIterator lines =
        org.apache.commons.io.FileUtils.lineIterator(stagingFile);
    try {
      if (lines.hasNext()) {
        String[] header = lines.nextLine().split(Converter.VALUE_DELIMITER);
        int caseIdColumn =
            Arrays.asList(header).indexOf(Converter.MUTATION_CASE_ID_COLUMN_HEADER);

        if (caseIdColumn == -1) {
          // not a MAF file: the header cells are the case IDs, the remaining rows are ignored
          for (String headerCell : header) {
            if (caseIDs.isTumorCaseID(headerCell)) {
              collected.add(caseIDs.convertCaseID(headerCell));
            }
          }
        } else {
          // MAF file: pick the case ID column out of every data row
          while (lines.hasNext()) {
            String candidate = lines.nextLine().split(Converter.VALUE_DELIMITER)[caseIdColumn];
            if (caseIDs.isTumorCaseID(candidate)) {
              collected.add(caseIDs.convertCaseID(candidate));
            }
          }
        }
      }
    } finally {
      lines.close();
    }

    return new ArrayList<String>(collected);
  }