private static void execSearchDoeCorpus(String[] args) { String searchString = args[1]; boolean interactive = false; // Load corpus first execParseDoeCorpus(false /* doSave */); // Check mode if (args.length > 2) if (args[2].equals("i")) interactive = true; // Search for string HashMap<String, ArrayList<DOECorpusLine>> matches = corpus.getConcordances(searchString); printSearchResults(searchString, matches); if (interactive) { do { System.out.println("Enter the sear term here (or QUIT to exit):"); Scanner scanIn = new Scanner(System.in); searchString = scanIn.nextLine(); scanIn.close(); if (!("QUIT".equals(searchString))) { matches = corpus.getConcordances(searchString); printSearchResults(searchString, matches); } } while (!("QUIT".equals(searchString))); } }
/**
 * Prints concordance lines for the search term given in args[1].
 *
 * <p>By default the output is effectively unlimited; if any third argument is
 * present, at most 100 randomly selected lines are printed.
 */
private static void execFindConcordancesDoeCorpus(String[] args) {
  // Presence of a third argument caps the output at 100 lines.
  int limit = (args.length > 2) ? 100 : 1000000;
  String searchString = args[1];
  // Make sure the corpus is loaded before querying it.
  execParseDoeCorpus(false /* doSave */);
  HashMap<String, ArrayList<DOECorpusLine>> matches = corpus.getConcordances(searchString);
  // Flatten all per-key match lists into a single list.
  ArrayList<DOECorpusLine> allLines = new ArrayList<DOECorpusLine>();
  for (ArrayList<DOECorpusLine> list : matches.values()) {
    allLines.addAll(list);
  }
  // Downsample when there are more hits than the limit allows.
  if (allLines.size() > limit) {
    allLines = getRandomSelectionFromList(allLines, limit);
  }
  System.out.println("Concordances for search term " + searchString + " with limit " + limit);
  for (DOECorpusLine hit : allLines) {
    System.out.println(
        "Doc ID: " + hit.getShortTitle() + "; Line ID: " + hit.getLineID() + "; " + hit.getLine());
  }
}
private static void execConvertToMalletFormat(String[] args) { if (args.length != 5) throw new IllegalArgumentException( "There should be 5 arguments, but I found only " + args.length); // Load corpus first execParseDoeCorpus(false /* doSave */); String targetTerm = args[1]; String sourceFile = args[2]; String targetFile = args[3]; int termWindowSize = Integer.parseInt(args[4]); System.out.println("Reading concordance file for target term '" + targetTerm + "'..."); System.out.println("File path: " + sourceFile); BufferedReader in = null; String malletFileString = ""; try { in = new BufferedReader(new FileReader(sourceFile)); String str; while ((str = in.readLine()) != null) { String[] lineArray = str.split(";"); String docID = lineArray[0].replace("Doc ID: ", "").trim(); String lineID = lineArray[1].replace("Line ID: ", "").trim(); String instanceID = (docID + "_" + lineID).replaceAll(" ", "_"); String senseID = lineArray[2].replace("DOE sense ID: ", "").trim(); String text = lineArray[3]; String data = corpus.getWindowTokensAsSpaceSeparatedString(targetTerm, docID, lineID, termWindowSize); if (data != null) { String malletInstanceLine = instanceID + " " + senseID + " " + data + "\n"; malletFileString += malletInstanceLine; } } in.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Writing result to Mallet file..."); System.out.println("File path: " + targetFile); PrintWriter out; try { out = new PrintWriter(targetFile); out.println(malletFileString); out.close(); } catch (FileNotFoundException e) { // TODO Auto-generated catch block e.printStackTrace(); } System.out.println("Writing result to Mallet file...DONE"); }
private static void execFindOccurrencesDoeCorpus(String[] args) { int minCount = Integer.parseInt(args[1]); int maxCount = Integer.parseInt(args[2]); int minLen = Integer.parseInt(args[3]); // Load corpus first execParseDoeCorpus(false /* doSave */); corpus.findTypesWithMinMaxOccurrenceAndMinLength(minCount, maxCount, minLen); }
/**
 * Parses the DOE corpus from its HTML sources into the static {@code corpus}
 * field, reusing an already-loaded corpus when available.
 *
 * @param doSave if true, the parsed corpus is also serialized to disk
 */
private static void execParseDoeCorpus(boolean doSave) {
  // Reuse a previously loaded corpus instead of re-parsing the HTML sources.
  if (corpus != null) {
    System.out.println("Using pre-loaded corpus.");
    return;
  }
  DOECorpusReader reader =
      new DOECorpusReader(DOE_CORPUS_SOURCE_DIR, DOE_CORPUS_SAVE_DIR, doSave);
  corpus = reader.loadFromHTML();
  // Optionally persist the parsed corpus for faster subsequent loads.
  if (doSave) {
    corpus.saveToFile(DOE_CORPUS_SAVE_DIR + "doe.corpus");
  }
}
private static void execPrintTypesDoeCorpus(String[] args) { boolean ascending = true; // Ascending or descending? if (args.length > 1) if ("asc".equalsIgnoreCase(args[1])) ascending = true; else if ("desc".equalsIgnoreCase(args[1])) ascending = false; else throw new IllegalArgumentException("Second parameter must be either 'asc' or 'desc'"); // Save to file? String path = ""; List<String> lines = null; boolean writeToFile = false; if (args.length > 2) { path = args[2]; lines = new ArrayList<>(); writeToFile = true; } // Load corpus first execParseDoeCorpus(false /* doSave */); HashMap<String, Integer> types = corpus.getVocabulary(); Map<String, Integer> sortedTypes = ListAndMapUtil.sortByComparator(types, ascending); for (String key : sortedTypes.keySet()) { String line = key + ";" + sortedTypes.get(key); System.out.println(line); if (writeToFile) lines.add(line); } // Save to file if (writeToFile) try { FileUtils.writeLines(new File(path), lines); } catch (IOException e) { System.out.println("Error while trying to write to file " + path); e.printStackTrace(); } }
/**
 * Loads the serialized DOE corpus from disk into the static {@code corpus} field.
 *
 * @param params unused; kept for signature compatibility with the command dispatcher
 */
private static void execLoadDoeCorpus(ArrayList<String> params) {
  // BUG FIX: the previous version assigned the result to a LOCAL variable that
  // shadowed the static 'corpus' field, so the loaded corpus was immediately
  // discarded. Assign to the shared field so later commands can use it.
  corpus = DOECorpus.loadFromFile(DOE_CORPUS_SAVE_DIR + "doe.corpus");
}
/** * @param targetTerm * @param sourceFile * @param termWindowSize * @param pipe */ private static InstanceList readConcordanceFileToInstanceList( String targetTerm, String sourceFile, int termWindowSize, Pipe pipe, boolean useCollocationalVector) { InstanceList instanceList = new InstanceList(pipe); BufferedReader in = null; try { in = new BufferedReader(new FileReader(sourceFile)); int incomplete = 0; String str; while ((str = in.readLine()) != null) { String[] lineArray = str.split(";"); if (lineArray.length != 4) { System.out.println( "WARNING: Skipping possibly invalid CSV line " + str + " in file " + sourceFile); continue; } String docID = lineArray[0].replace("Doc ID: ", "").trim(); String lineID = lineArray[1].replace("Line ID: ", "").trim(); String instanceID = (docID + "_" + lineID).replaceAll(" ", "_"); String senseID = lineArray[2].replace("DOE sense ID: ", "").trim(); String text = lineArray[3]; if (targetTerm.equals("faeder")) targetTerm = "fæder"; ArrayList<String> data = corpus.getWindowTokens(targetTerm, docID, lineID, termWindowSize); if (data.size() != 2 * termWindowSize) { incomplete++; System.out.println("WARNING: Incomplete token list " + incomplete + " found " + data); } if (useCollocationalVector) { System.out.println("Converting data to collocational vector: \n\t" + data); int i = termWindowSize * (-1); int index = i + termWindowSize; while (i <= termWindowSize && index < data.size()) { if (i != 0) { data.set(index, data.get(index) + "_" + i); // skip position of target term index++; } i++; } System.out.println("Converting data to collocational vector...DONE\n\t" + data); } String dataStr = data.toString().replace(", ", " ").replace("[", "").replace("]", "").replace(".", ""); Instance trainingInstance = new Instance(dataStr, senseID, instanceID, text); instanceList.addThruPipe(trainingInstance); } in.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (in != null) 
try { in.close(); } catch (IOException e1) { } } return instanceList; }