Esempio n. 1
0
  /**
   * Reads a tab-separated mapping file into a lookup table.
   *
   * <p>Every non-empty line must have the form {@code original<TAB>target}. Only the first tab
   * acts as the separator, so the target part may itself contain tabs. If the same original
   * string occurs more than once, the last occurrence wins. The returned table always contains
   * the entry {@code "$MAPPED_VECS" -> "$MAPPED_VECS"}, which is inserted before the file is read.
   *
   * @param fName name of the mapping file, handed to {@code FileUtils.openFile}
   * @return table mapping each original string to its target string
   * @throws IOException if the file cannot be read, or if a non-empty line contains no tab
   */
  private static Hashtable<String, String> readMappingFile(String fName) throws IOException {
    Hashtable<String, String> res = new Hashtable<String, String>();
    res.put("$MAPPED_VECS", "$MAPPED_VECS");

    BufferedReader br = FileUtils.openFile("Mapping file", fName);
    try {
      String line;
      int lineNumber = 0;
      while ((line = br.readLine()) != null) {
        lineNumber++;
        if (line.length() == 0) {
          // Tolerate blank lines (typically a trailing newline at the end of the file).
          continue;
        }
        int tabPos = line.indexOf('\t');
        if (tabPos < 0) {
          // Report the offending line instead of failing with a bare array-index error.
          throw new IOException(
              "Malformed mapping file '"
                  + fName
                  + "', line "
                  + lineNumber
                  + ": expected '<original>\\t<target>' but found no tab in '"
                  + line
                  + "'");
        }
        res.put(line.substring(0, tabPos), line.substring(tabPos + 1));
      }
    } finally {
      // Release the file handle even when reading or parsing fails.
      br.close();
    }

    return res;
  }
Esempio n. 2
0
  /**
   * Rewrites the labels of an input vector file according to a name-mapping file.
   *
   * <p>Lines starting with {@code $} are copied to the output unchanged. For every other line the
   * text after the last space is taken as the label. The label is optionally stripped of all
   * matches of the {@code stripFromString} regular expression and optionally reduced to its last
   * {@code /}-separated segment, then looked up via {@code getReplacement}. Lines whose label has
   * a replacement are written with the new label; all other lines are dropped and counted as
   * skipped. The output is gzip-compressed when the {@code gzip} option is set.
   *
   * @param args command line arguments, parsed against {@code OPTIONS}
   * @throws IOException if the mapping, input or output file cannot be read or written
   */
  public static void main(String[] args) throws IOException {
    // register and parse all options
    JSAPResult config = OptionFactory.parseResults(args, OPTIONS);

    String inputFileName = AbstractOptionFactory.getFilePath(config, "input");
    String mappingFile = AbstractOptionFactory.getFilePath(config, "nameMappingFile");
    String outputFileName = AbstractOptionFactory.getFilePath(config, "output");

    String stripFromLabel = config.getString("stripFromString");
    boolean onlyLastPathSegment = config.getBoolean("onlyLastPathSegment");

    boolean gzip = config.getBoolean("gzip");

    MatchMode matchMode = MatchMode.valueOf(config.getString("matchMode"));

    // Compile the strip expression once instead of once per vector line. A missing or empty
    // expression means "strip nothing" (an empty regex replaced by "" leaves the label as is).
    java.util.regex.Pattern stripPattern =
        stripFromLabel == null || stripFromLabel.length() == 0
            ? null
            : java.util.regex.Pattern.compile(stripFromLabel);

    Hashtable<String, String> mapping = readMappingFile(mappingFile);
    String[] keys = null;
    if (matchMode == MatchMode.endsWith) {
      // defining the keys once as String[] is still slow, but faster than calling
      // Hashtable#keySet() often
      keys = mapping.keySet().toArray(new String[mapping.size()]);
    }

    int written = 0;
    int skipped = 0;

    OutputStream out = new FileOutputStream(outputFileName);
    BufferedReader br = null;
    try {
      if (gzip) {
        out = new GZIPOutputStream(out);
      }
      BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out));

      // First pass over the input: only the headers, to learn the vector count for the progress
      // display. The reader is closed here; a second close by the callee would be harmless.
      int totalVectorCount;
      BufferedReader headerReader = FileUtils.openFile("Input vector", inputFileName);
      try {
        HashMap<String, String> headers =
            FileUtils.readSOMLibFileHeaders(headerReader, "input vector");
        totalVectorCount = Integer.parseInt(headers.get("$XDIM"));
      } finally {
        headerReader.close();
      }

      // Second pass: copy headers and rewrite the vector lines.
      br = FileUtils.openFile("Input vector", inputFileName);
      StdErrProgressWriter progress =
          new StdErrProgressWriter(totalVectorCount, "rewriting vector ", 100);

      String line;
      while ((line = br.readLine()) != null) {
        if (line.startsWith("$")) {
          // NOTE(review): header lines are copied verbatim, so any vector count they carry is
          // not adjusted when vectors are skipped below - confirm whether consumers rely on it.
          bw.write(line);
          bw.newLine();
          continue;
        }

        int lastPos = line.lastIndexOf(' ');
        String target = null;
        if (lastPos >= 0) {
          String label = line.substring(lastPos + 1);
          if (stripPattern != null) {
            label = stripPattern.matcher(label).replaceAll("");
          }
          if (onlyLastPathSegment && label.contains("/")) {
            label = label.substring(label.lastIndexOf("/") + 1);
          }
          target = getReplacement(mapping, keys, label, matchMode);
        }
        // A line without any space has no label part to replace; it is counted as skipped
        // rather than failing on substring(0, -1).

        if (target != null) {
          bw.write(line.substring(0, lastPos));
          bw.write(" ");
          bw.write(target);
          bw.newLine();
          written++;
        } else {
          skipped++;
        }

        progress.progress();
      }

      // Flushes the buffer and closes the underlying stream on the success path.
      bw.close();
    } finally {
      // Release both files even when processing fails half-way; closing the already closed
      // output stream after a successful run is a no-op.
      try {
        if (br != null) {
          br.close();
        }
      } finally {
        out.close();
      }
    }

    System.out.println(
        "Wrote "
            + written
            + " vectors, skipped "
            + skipped
            + " because no label found in matching file");
  }