/**
 * Reads a tab-separated mapping file into a lookup table.
 * <p>
 * Each non-blank line must have the form {@code original<TAB>target}; the line is split at the first tab
 * only, so the target may itself contain tabs. If the same original string occurs more than once, the last
 * occurrence wins. Blank lines are ignored. The table is pre-seeded with an identity entry for
 * {@code $MAPPED_VECS}.
 *
 * @param fName path of the mapping file to read
 * @return table mapping each original string to its target string
 * @throws IOException if the file cannot be read, or if a non-blank line contains no tab separator
 */
private static Hashtable<String, String> readMappingFile(String fName) throws IOException {
    Hashtable<String, String> res = new Hashtable<String, String>();
    // Identity entry; the reason for it is not visible in this method (NOTE(review): confirm with callers).
    res.put("$MAPPED_VECS", "$MAPPED_VECS");

    BufferedReader br = FileUtils.openFile("Mapping file", fName);
    try {
        String line;
        int lineNumber = 0;
        while ((line = br.readLine()) != null) {
            lineNumber++;
            if (line.trim().length() == 0) {
                // tolerate empty lines (e.g. a trailing newline at the end of the file)
                continue;
            }
            String[] splits = line.split("\t", 2);
            if (splits.length < 2) {
                // fail with a meaningful message instead of an ArrayIndexOutOfBoundsException
                throw new IOException("Malformed mapping in '" + fName + "', line " + lineNumber
                        + ": expected '<original>\\t<target>' but found '" + line + "'");
            }
            res.put(splits[0], splits[1]);
        }
    } finally {
        // release the file handle even when reading or parsing fails
        br.close();
    }
    return res;
}
public static void main(String[] args) throws IOException { // register and parse all options JSAPResult config = OptionFactory.parseResults(args, OPTIONS); String inputFileName = AbstractOptionFactory.getFilePath(config, "input"); String mappingFile = AbstractOptionFactory.getFilePath(config, "nameMappingFile"); String outputFileName = AbstractOptionFactory.getFilePath(config, "output"); String stripFromLabel = config.getString("stripFromString"); boolean onlyLastPathSegment = config.getBoolean("onlyLastPathSegment"); boolean gzip = config.getBoolean("gzip"); MatchMode matchMode = MatchMode.valueOf(config.getString("matchMode")); Hashtable<String, String> mapping = readMappingFile(mappingFile); String[] keys = null; if (matchMode == MatchMode.endsWith) { // defining the keys ones as String[] is still slow, but faster than using Hashmap#keySet() // often keys = mapping.keySet().toArray(new String[mapping.size()]); } OutputStream out = gzip ? new GZIPOutputStream(new FileOutputStream(outputFileName)) : new FileOutputStream(outputFileName); BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(out)); HashMap<String, String> headers = FileUtils.readSOMLibFileHeaders( FileUtils.openFile("Input vector", inputFileName), "input vector"); int totalVectorCount = Integer.parseInt(headers.get("$XDIM")); BufferedReader br = FileUtils.openFile("Input vector", inputFileName); String line = null; StdErrProgressWriter progress = new StdErrProgressWriter(totalVectorCount, "rewriting vector ", 100); int written = 0; int skipped = 0; while ((line = br.readLine()) != null) { if (line.startsWith("$")) { bw.write(line); bw.newLine(); } else { int lastPos = line.lastIndexOf(" "); String label = line.substring(lastPos + 1); // System.out.println(stripFromLabel); // System.out.print(label); label = label.replaceAll(stripFromLabel, ""); if (onlyLastPathSegment && label.contains("/")) { label = label.substring(label.lastIndexOf("/") + 1); } // System.out.println("=>" + label); String 
target = getReplacement(mapping, keys, label, matchMode); if (target != null) { bw.write(line.substring(0, lastPos)); bw.write(" " + target); bw.newLine(); written++; } else { // System.out.println("No label found for " + label); skipped++; } progress.progress(); } } br.close(); bw.close(); System.out.println( "Wrote " + written + " vectors, skipped " + skipped + " because no label found in matching file"); }