/**
 * Entry point: selects a corpus, extracts a sample from it and splits that
 * sample into a training set and a test set.
 *
 * @param args optional; {@code args[0]} names the corpus ("brown" or "negra").
 *             When it is missing or empty, "brown" is used. Any other value
 *             prints a warning to stderr and also falls back to "brown".
 */
public static void main(String[] args) {
    // Guard against an empty argument array: reading args[0] unconditionally
    // would throw ArrayIndexOutOfBoundsException when no argument is given.
    String param1 = (args.length > 0 && !args[0].isEmpty()) ? args[0] : "brown";

    Corpus c;
    if (param1.equals("brown")) {
        c = new Brown();
    } else if (param1.equals("negra")) {
        c = new Negra();
    } else {
        System.err.println(
                "Illegal parameter! Using standard parameter "
                        + STANDARD_PARAMETER_STRING
                        + " instead.");
        c = new Brown();
    }

    int sizeCorpus = c.getSize();
    int sizeSample = (int) (sizeCorpus * CORPUS_PERCENTAGE);

    /*
     * extracting a sample from the corpus and splitting it up into training set and test set
     * (currently only the line numbers, lines themselves are written to file)
     */
    c.extractSample(sizeSample);
    c.splitSample("tree");
}
// marginalise over all corpus files using threads public synchronized void count() { Helper.report("[ContextCounter] Counting over all corpus files..."); File corpusFolder = new File(DepNeighbourhoodSpace.getProjectFolder(), Corpus.getFolderName()); String[] corpusFilenames = corpusFolder.list(); Arrays.sort(corpusFilenames); // run each dep marginaliser thread for (String corpusFilename : corpusFilenames) { DepContextCounterThread ccThread = new DepContextCounterThread( this, corpusFilename, new File(corpusFolder, corpusFilename), amountOfSentences); threads.add(ccThread); (new Thread(ccThread)).start(); } // wait for all threads to finish try { while (!threads.isEmpty()) { wait(); } } catch (InterruptedException e) { } Helper.report("[ContextCounter] ...Finished counting over all corpus files..."); }
public static DepContextCounts importFromReader(BufferedReader in) throws IOException { Helper.report("[ContextCounts] Importing context word counts..."); DepContextCounts dmc = new DepContextCounts(); String line; while ((line = in.readLine()) != null) { if (line.startsWith("<contextcounts")) { Matcher matcher = contextCountsPattern.matcher(line); if (matcher.find()) { // ignore first entry: corpus name Corpus.setName(matcher.group(1)); } } else if (line.startsWith("<deprelation")) { Matcher matcher = depRelationPattern.matcher(line); if (matcher.find()) { // ignore first entry: corpus name String depRelationString = matcher.group(1); importDepRelationCounts(in, dmc, depRelationString); } } else if (line.equals("</contextcounts>")) { break; } } Helper.report("[ContextCounts] ...Finished importing context word counts."); return dmc; }
/**
 * Creates a counter with an empty thread set and fresh counts, and records
 * the given corpus folder name on {@code Corpus}.
 *
 * @param corpusFolderName  folder name passed to {@code Corpus.setFolderName}
 * @param amountOfSentences how many sentences per corpus file should be
 *                          included in the count
 */
public DepContextCounter(String corpusFolderName, int amountOfSentences) {
    this.amountOfSentences = amountOfSentences;

    Corpus.setFolderName(corpusFolderName);

    this.counts = new DepContextCounts();
    this.threads = new HashSet<>();
}
/**
 * Returns a short preview of the counts: the corpus name, followed by each
 * dependency relation and at most five of its "count TAB word" lines, in the
 * iteration order of the underlying maps.
 *
 * @return the preview text
 */
@Override
public String toString() {
    // at most this many context words are printed per relation
    final int previewLimit = 5;

    // StringBuilder avoids re-copying the whole text on every append
    StringBuilder sb = new StringBuilder();
    sb.append("CORPUS \"").append(Corpus.getName()).append("\"\n");

    // Fully qualified Map.Entry so this method does not rely on an import
    // that may not be present in the file.
    for (java.util.Map.Entry<String, ? extends java.util.Map<String, Long>> relation
            : depRelationWordCountMap.entrySet()) {
        sb.append("DEPRELATION \"").append(relation.getKey()).append("\"\n");

        int shown = 0;
        for (java.util.Map.Entry<String, Long> wordCount : relation.getValue().entrySet()) {
            sb.append(wordCount.getValue())
                    .append("\t")
                    .append(wordCount.getKey())
                    .append("\n");
            if (++shown >= previewLimit) {
                break;
            }
        }
    }
    return sb.toString();
}