/** * @param targetTerm * @param sourceFile * @param termWindowSize * @param pipe */ private static InstanceList readConcordanceFileToInstanceList( String targetTerm, String sourceFile, int termWindowSize, Pipe pipe, boolean useCollocationalVector) { InstanceList instanceList = new InstanceList(pipe); BufferedReader in = null; try { in = new BufferedReader(new FileReader(sourceFile)); int incomplete = 0; String str; while ((str = in.readLine()) != null) { String[] lineArray = str.split(";"); if (lineArray.length != 4) { System.out.println( "WARNING: Skipping possibly invalid CSV line " + str + " in file " + sourceFile); continue; } String docID = lineArray[0].replace("Doc ID: ", "").trim(); String lineID = lineArray[1].replace("Line ID: ", "").trim(); String instanceID = (docID + "_" + lineID).replaceAll(" ", "_"); String senseID = lineArray[2].replace("DOE sense ID: ", "").trim(); String text = lineArray[3]; if (targetTerm.equals("faeder")) targetTerm = "fæder"; ArrayList<String> data = corpus.getWindowTokens(targetTerm, docID, lineID, termWindowSize); if (data.size() != 2 * termWindowSize) { incomplete++; System.out.println("WARNING: Incomplete token list " + incomplete + " found " + data); } if (useCollocationalVector) { System.out.println("Converting data to collocational vector: \n\t" + data); int i = termWindowSize * (-1); int index = i + termWindowSize; while (i <= termWindowSize && index < data.size()) { if (i != 0) { data.set(index, data.get(index) + "_" + i); // skip position of target term index++; } i++; } System.out.println("Converting data to collocational vector...DONE\n\t" + data); } String dataStr = data.toString().replace(", ", " ").replace("[", "").replace("]", "").replace(".", ""); Instance trainingInstance = new Instance(dataStr, senseID, instanceID, text); instanceList.addThruPipe(trainingInstance); } in.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (in != null) try { in.close(); } catch (IOException e1) { } } return instanceList; }