예제 #1
0
  /**
   * Entry point: selects a corpus, extracts a sample from it and splits that sample.
   *
   * <p>The first command-line argument names the corpus: {@code "brown"} or {@code "negra"}.
   * When the argument is missing or empty, {@code "brown"} is used. Any other value prints a
   * warning to standard error and also falls back to the Brown corpus.
   *
   * @param args command-line arguments; only {@code args[0]} is read, and it is optional
   */
  public static void main(String[] args) {
    // Guard against an empty argument array: indexing args[0] unconditionally would throw
    // ArrayIndexOutOfBoundsException before the default could ever apply.
    String param1 = (args.length > 0 && !args[0].isEmpty()) ? args[0] : "brown";

    Corpus c;
    if (param1.equals("brown")) {
      c = new Brown();
    } else if (param1.equals("negra")) {
      c = new Negra();
    } else {
      System.err.println(
          "Illegal parameter! Using standard parameter " + STANDARD_PARAMETER_STRING + " instead.");
      c = new Brown();
    }

    // The sample size is a fraction of the corpus size; the cast truncates towards zero.
    int sizeCorpus = c.getSize();
    int sizeSample = (int) (sizeCorpus * CORPUS_PERCENTAGE);

    /*
     * Extract a sample from the corpus and split it up into training set and test set
     * (currently only the line numbers; the lines themselves are written to file).
     */
    c.extractSample(sizeSample);
    c.splitSample("tree");
  }
예제 #2
0
  /**
   * Counts over all corpus files, starting one {@link DepContextCounterThread} per file, and
   * blocks until the {@code threads} collection is empty.
   *
   * <p>The corpus folder is resolved from {@code DepNeighbourhoodSpace.getProjectFolder()} and
   * {@code Corpus.getFolderName()}; its entries are processed in sorted name order.
   *
   * <p>NOTE(review): this method only waits on this object's monitor; it presumably relies on
   * the worker threads removing themselves from {@code threads} and calling {@code notify} on
   * this counter when they finish. Confirm against {@code DepContextCounterThread}.
   *
   * @throws IllegalStateException if the corpus folder cannot be listed
   */
  public synchronized void count() {
    Helper.report("[ContextCounter] Counting over all corpus files...");

    File corpusFolder = new File(DepNeighbourhoodSpace.getProjectFolder(), Corpus.getFolderName());
    String[] corpusFilenames = corpusFolder.list();
    if (corpusFilenames == null) {
      // File.list() returns null when the path is not a directory or an I/O error occurs;
      // fail with a descriptive message instead of an opaque NPE inside Arrays.sort.
      throw new IllegalStateException("Cannot list corpus folder: " + corpusFolder);
    }
    Arrays.sort(corpusFilenames);

    // Start one worker thread per corpus file.
    for (String corpusFilename : corpusFilenames) {
      DepContextCounterThread ccThread =
          new DepContextCounterThread(
              this, corpusFilename, new File(corpusFolder, corpusFilename), amountOfSentences);
      threads.add(ccThread);
      (new Thread(ccThread)).start();
    }

    // Wait until every registered worker has been removed from the set.
    try {
      while (!threads.isEmpty()) {
        wait();
      }
    } catch (InterruptedException e) {
      // Stop waiting as before, but restore the interrupt flag so callers can still see it.
      Thread.currentThread().interrupt();
    }

    Helper.report("[ContextCounter] ...Finished counting over all corpus files...");
  }
예제 #3
0
  /**
   * Reads context word counts from the given reader into a new {@link DepContextCounts}.
   *
   * <p>Lines are consumed one at a time until end of input or a line equal to
   * {@code </contextcounts>}:
   *
   * <ul>
   *   <li>a line starting with {@code <contextcounts} that matches {@code contextCountsPattern}
   *       sets the corpus name via {@code Corpus.setName} from the first capture group;
   *   <li>a line starting with {@code <deprelation} that matches {@code depRelationPattern}
   *       hands the reader, the result object and the first capture group to
   *       {@code importDepRelationCounts};
   *   <li>all other lines are skipped.
   * </ul>
   *
   * @param in reader positioned at the start of the serialized counts
   * @return the populated counts object
   * @throws IOException if reading from {@code in} fails
   */
  public static DepContextCounts importFromReader(BufferedReader in) throws IOException {
    Helper.report("[ContextCounts] Importing context word counts...");
    DepContextCounts importedCounts = new DepContextCounts();

    for (String currentLine = in.readLine(); currentLine != null; currentLine = in.readLine()) {

      if (currentLine.startsWith("<contextcounts")) {
        // Opening tag: take the corpus name from it when the pattern matches.
        Matcher headerMatcher = contextCountsPattern.matcher(currentLine);
        if (headerMatcher.find()) {
          Corpus.setName(headerMatcher.group(1));
        }
        continue;
      }

      if (currentLine.startsWith("<deprelation")) {
        // Relation section: the helper is given the same reader to continue from.
        Matcher relationMatcher = depRelationPattern.matcher(currentLine);
        if (relationMatcher.find()) {
          importDepRelationCounts(in, importedCounts, relationMatcher.group(1));
        }
        continue;
      }

      if (currentLine.equals("</contextcounts>")) {
        // Closing tag: stop reading here.
        break;
      }
    }

    Helper.report("[ContextCounts] ...Finished importing context word counts.");
    return importedCounts;
  }
예제 #4
0
 /**
  * Creates a counter with an empty thread set and a fresh {@link DepContextCounts}.
  *
  * <p>Also passes {@code corpusFolderName} to the static {@code Corpus.setFolderName}.
  *
  * @param corpusFolderName folder name handed to {@code Corpus.setFolderName}
  * @param amountOfSentences how many sentences per corpus file should be included in the count
  */
 public DepContextCounter(String corpusFolderName, int amountOfSentences) {
   Corpus.setFolderName(corpusFolderName);
   this.amountOfSentences = amountOfSentences;
   this.threads = new HashSet<>();
   this.counts = new DepContextCounts();
 }
예제 #5
0
  /**
   * Builds a short, human-readable preview of the stored counts.
   *
   * <p>The result starts with a {@code CORPUS "<name>"} line using {@code Corpus.getName()},
   * followed by one {@code DEPRELATION "<relation>"} line per key of
   * {@code depRelationWordCountMap}. Under each relation at most five
   * {@code <count>\t<word>} lines are listed. The word maps are {@link HashMap}s, so which
   * five entries appear is determined by the map's iteration order, not by count.
   *
   * @return the preview text, every line terminated by {@code \n}
   */
  @Override
  public String toString() {
    // StringBuilder avoids re-copying the whole text on every append inside the nested loops.
    StringBuilder sb = new StringBuilder();
    sb.append("CORPUS \"").append(Corpus.getName()).append("\"\n");

    for (String depRelationString : depRelationWordCountMap.keySet()) {
      HashMap<String, Long> wordCountMap = depRelationWordCountMap.get(depRelationString);
      sb.append("DEPRELATION \"").append(depRelationString).append("\"\n");

      int i = 0;
      for (String contextWord : wordCountMap.keySet()) {
        long count = wordCountMap.get(contextWord);
        sb.append(count).append("\t").append(contextWord).append("\n");
        // Only the first five entries per relation are listed.
        if (++i >= 5) {
          break;
        }
      }
    }

    return sb.toString();
  }