/** Checks that a comment marker set on the default format is reported back by the getter. */
@Test
public void testWithCommentStart() throws Exception {
  final Character expectedMarker = Character.valueOf('#');
  final CSVFormat commentedFormat = CSVFormat.DEFAULT.withCommentMarker('#');
  final Character actualMarker = commentedFormat.getCommentMarker();
  assertEquals(expectedMarker, actualMarker);
}
/** Expects an {@link IllegalArgumentException} when {@code CR} is used as the comment marker. */
@Test(expected = IllegalArgumentException.class)
public void testWithCommentStartCRThrowsException() {
  final CSVFormat baseFormat = CSVFormat.DEFAULT;
  // The call itself must throw; the returned format is intentionally ignored.
  baseFormat.withCommentMarker(CR);
}
/** @author Ivan Habernal */ public final class SVMHMMUtils { /** File name of serialized mapping from String labels to numbers */ public static final String LABELS_TO_INTEGERS_MAPPING_FILE_NAME = "labelsToIntegersMapping_DualTreeBidiMap.bin"; /** CSV file comment */ public static final String CSV_COMMENT = "Columns: gold, predicted, token, seqID"; /** Format of CSV files */ public static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withCommentMarker('#'); /** Where the gold outcomes, predicted outcomes, and tokens are stored */ public static final String GOLD_PREDICTED_OUTCOMES_CSV = "outcomesGoldPredicted.csv"; private SVMHMMUtils() { // empty } /** * Extract all outcomes from featureVectorsFiles (training, test) that are in LIBSVM format - each * line is a feature vector and the first token is the outcome label * * @param files files in LIBSVM format * @return set of all unique outcomes * @throws java.io.IOException */ public static SortedSet<String> extractOutcomeLabelsFromFeatureVectorFiles(File... 
files) throws IOException { SortedSet<String> result = new TreeSet<>(); for (File file : files) { result.addAll(extractOutcomeLabels(file)); } return result; } /** * Maps names to numbers (numbers are required by SVMLight format) * * @param names names (e.g., features, outcomes) * @return bidirectional map of name:number */ public static BidiMap mapVocabularyToIntegers(SortedSet<String> names) { BidiMap result = new DualTreeBidiMap(); // start numbering from 1 int index = 1; for (String featureName : names) { result.put(featureName, index); index++; } return result; } /** * Creates a new file in the same directory as {@code featureVectorsFile} and replaces the first * token (outcome label) by its corresponding integer number from the bi-di map * * @param featureVectorsFile file * @param labelsToIntegers mapping * @return new file */ public static File replaceLabelsWithIntegers(File featureVectorsFile, BidiMap labelsToIntegers) throws IOException { File result = new File( featureVectorsFile.getParent(), "mappedLabelsToInt_" + featureVectorsFile.getName()); PrintWriter pw = new PrintWriter(new FileOutputStream(result)); BufferedReader br = new BufferedReader(new FileReader(featureVectorsFile)); String line = null; while ((line = br.readLine()) != null) { // split on the first whitespaces, keep the rest String[] split = line.split("\\s", 2); String label = split[0]; String remainingContent = split[1]; // find the integer Integer intOutput = (Integer) labelsToIntegers.get(label); // print to the output stream pw.printf("%d %s%n", intOutput, remainingContent); } IOUtils.closeQuietly(pw); IOUtils.closeQuietly(br); return result; } /** * Saves label-integer mapping to a file * * @param mapping mapping * @param outputFile file * @throws IOException */ public static void saveMapping(BidiMap mapping, File outputFile) throws IOException { ObjectOutputStream objectOutputStream = new ObjectOutputStream(new FileOutputStream(outputFile)); objectOutputStream.writeObject(mapping); 
IOUtils.closeQuietly(objectOutputStream); } /** * Saves the feature mapping to readable format, each line is a feature id and feature name, * sorted by feature id * * @param mapping mapping (name:id) * @param outputFile output file * @throws IOException */ public static void saveMappingTextFormat(BidiMap mapping, File outputFile) throws IOException { PrintWriter pw = new PrintWriter(new FileOutputStream(outputFile)); // sort values (feature indexes) @SuppressWarnings("unchecked") SortedSet<Object> featureIndexes = new TreeSet<Object>(mapping.values()); for (Object featureIndex : featureIndexes) { pw.printf( Locale.ENGLISH, "%5d %s%n", (int) featureIndex, mapping.getKey(featureIndex).toString()); } IOUtils.closeQuietly(pw); } /** * Loads a serialized BidiMap from file * * @param inputFile input file * @return BidiMap * @throws IOException */ public static BidiMap loadMapping(File inputFile) throws IOException { ObjectInputStream inputStream = new ObjectInputStream(new FileInputStream(inputFile)); try { return (BidiMap) inputStream.readObject(); } catch (ClassNotFoundException e) { throw new IOException(e); } finally { IOUtils.closeQuietly(inputStream); } } /** * Extracts the outcome labels from the file; it corresponds to the first token on each line. * * @param featureVectorsFile featureVectors file * @return list of outcome labels * @throws IOException */ public static List<String> extractOutcomeLabels(File featureVectorsFile) throws IOException { List<String> result = new ArrayList<>(); List<String> lines = FileUtils.readLines(featureVectorsFile); for (String line : lines) { String label = line.split("\\s")[0]; result.add(label); } return result; } /** * Reads the featureVectorsFile and splits comment on each line into a list of strings, i.e. 
"TAG * qid:4 1:1 2:1 4:2 # token TAG 4" produces "token", "TAG", "4" * * @param featureVectorsFileStream featureVectors file stream * @return list (for each line) of list of comment parts * @throws IOException */ protected static Iterator<List<String>> extractComments(final File featureVectorsFileStream) throws IOException { return new CommentsIterator(featureVectorsFileStream); } /** * Extracts original tokens that are stored in the comment part of the featureVectorsFile * * @param featureVectorsFile featureVectors file * @return list of original tokens * @throws IOException */ public static List<String> extractOriginalTokens(File featureVectorsFile) throws IOException { List<String> result = new ArrayList<>(); Iterator<List<String>> comments = extractComments(featureVectorsFile); while (comments.hasNext()) { List<String> comment = comments.next(); // original token is the first one in comments result.add(comment.get(2)); } return result; } /** * Reads the prediction file (each line is a integer) and converts them into original outcome * labels using the mapping provided by the bi-directional map * * @param predictionsFile predictions from classifier * @param labelsToIntegersMapping mapping outcomeLabel:integer * @return list of outcome labels * @throws IOException */ public static List<String> extractOutcomeLabelsFromPredictions( File predictionsFile, BidiMap labelsToIntegersMapping) throws IOException { List<String> result = new ArrayList<>(); for (String line : FileUtils.readLines(predictionsFile)) { Integer intLabel = Integer.valueOf(line); String outcomeLabel = (String) labelsToIntegersMapping.getKey(intLabel); result.add(outcomeLabel); } return result; } /** * Returns a list of original sequence IDs extracted from comments * * @param featureVectorsFile featureVectors file * @return list of integers * @throws IOException */ public static List<Integer> extractOriginalSequenceIDs(File featureVectorsFile) throws IOException { List<Integer> result = new 
ArrayList<>(); Iterator<List<String>> comments = extractComments(featureVectorsFile); while (comments.hasNext()) { List<String> comment = comments.next(); // sequence number is the third token in the comment token result.add(Integer.valueOf(comment.get(1))); } return result; } /** * Given confusion matrix, it writes it in CSV and LaTeX form to the tasks output directory, and * also prints evaluations (F-measure, Precision, Recall) * * @param context task context * @param confusionMatrix confusion matrix * @throws java.io.IOException */ public static void writeOutputResults(TaskContext context, ConfusionMatrix confusionMatrix) throws IOException { writeOutputResults(context, confusionMatrix, null); } /** * Given confusion matrix, it writes it in CSV and LaTeX form to the tasks output directory, and * also prints evaluations (F-measure, Precision, Recall) * * @param context task context * @param confusionMatrix confusion matrix * @param filePrefix prefix of output files * @throws java.io.IOException */ public static void writeOutputResults( TaskContext context, ConfusionMatrix confusionMatrix, String filePrefix) throws IOException { // storing the results as latex confusion matrix String confMatrixFileTex = (filePrefix != null ? filePrefix : "") + "confusionMatrix.tex"; File matrixFolderTex = context.getFolder(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE); File evaluationFileLaTeX = new File(matrixFolderTex, confMatrixFileTex); FileUtils.writeStringToFile(evaluationFileLaTeX, confusionMatrix.toStringLatex()); // as CSV confusion matrix String confMatrixFileCsv = (filePrefix != null ? 
filePrefix : "") + "confusionMatrix.csv"; File matrixFolder = context.getFolder(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE); File evaluationFileCSV = new File(matrixFolder, confMatrixFileCsv); CSVPrinter csvPrinter = new CSVPrinter(new FileWriter(evaluationFileCSV), CSVFormat.DEFAULT); csvPrinter.printRecords(confusionMatrix.toStringMatrix()); IOUtils.closeQuietly(csvPrinter); // and results File evalFolder = context.getFolder(Constants.TEST_TASK_OUTPUT_KEY, StorageService.AccessMode.READWRITE); String evalFileName = new SVMHMMAdapter() .getFrameworkFilename(TCMachineLearningAdapter.AdapterNameEntries.evaluationFile); File evaluationFile = new File(evalFolder, evalFileName); PrintWriter pw = new PrintWriter(evaluationFile); pw.println(confusionMatrix.printNiceResults()); pw.println(confusionMatrix.printLabelPrecRecFm()); pw.println(confusionMatrix.printClassDistributionGold()); IOUtils.closeQuietly(pw); } public static List<SortedMap<String, String>> extractMetaDataFeatures(File featureVectorsFile) throws IOException { InputStream inputStream = new FileInputStream(featureVectorsFile); List<SortedMap<String, String>> result = new ArrayList<>(); Iterator<List<String>> allComments = extractComments(featureVectorsFile); while (allComments.hasNext()) { List<String> instanceComments = allComments.next(); SortedMap<String, String> instanceResult = new TreeMap<>(); for (String comment : instanceComments) { if (comment.startsWith(SVMHMMDataWriter.META_DATA_FEATURE_PREFIX)) { String[] split = comment.split(":"); String key = split[0]; String value = split[1]; instanceResult.put(key, value); } } result.add(instanceResult); } IOUtils.closeQuietly(inputStream); return result; } }