Example #1
0
  /**
   * Trains the tagger from the DocumentCollection stored in file 'trainingCollection'.
   * Every document in 'trainingCollection' must already carry explicit
   * part-of-speech annotations ("constit" annotations with a "cat" feature).
   */
  void train(String trainingCollection) {

    // One tag-table row per POS: annotation type "constit", feature "cat", value = the POS.
    for (int row = 0; row < posTable.length; row++) {
      tagTable[row] = new String[] {"constit", "cat", posTable[row], posTable[row]};
    }

    // Build an ergodic HMM: one state per POS, plus dedicated start and end states,
    // with zero-count arcs from start to every POS, POS to POS, and POS to end.
    HMMstate initial = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(initial);
    for (String pos : posTable) {
      initial.addArc(new HMMarc(pos, 0));
    }
    HMMstate terminal = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(terminal);
    for (String pos : posTable) {
      HMMstate posState = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
      posh.addState(posState);
      for (String successor : posTable) {
        posState.addArc(new HMMarc(successor, 0));
      }
      posState.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();

    posh.resetForTraining();
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false);

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int docIndex = 0; docIndex < col.size(); docIndex++) {
      ExternalDocument doc = col.get(docIndex);
      doc.open();
      System.out.println("Training from " + doc.fileName());

      // Split the document into sentences at end marks (constit cat="."),
      // wrapping each sentence span in an "S" annotation before training.
      int sentenceStart = 0;
      int cursor = 0;
      Vector anns;
      while ((anns = doc.annotationsAt(cursor, "constit")) != null) {
        Annotation ann = (Annotation) anns.get(0);
        cursor = ann.span().end();
        String cat = (String) ann.get("cat");
        if (cat.equals(".")) {
          doc.annotate("S", new Span(sentenceStart, cursor), new FeatureSet());
          sentenceStart = cursor;
        }
      }
      annotator.train(doc);
      // Release the document's annotations to reclaim memory before the next file.
      doc.clearAnnotations();
    }
    posh.computeProbabilities();
  }
Example #2
0
  /**
   * Loads the recorded test failures, builds the document and term collections
   * from them, assigns each term its IDF weight, and then prints every word of
   * every document (as a TermItem with positions) whose TF-IDF score is positive.
   */
  public void run() {
    List<TestFailure> failures = dto.loadFailures();
    DocumentCollection documents = DocumentCollection.getInstance();
    TermCollection terms = TermCollection.getInstance();

    TestFailureUtil.buildDocumentCollection(failures, documents);
    TestFailureUtil.buildTermCollection(failures, terms);

    // idf(term) = log(N / df), N = number of documents, df = documents containing the term.
    // NOTE(review): assumes findTermFrequency is non-null and > 0 for every collected
    // term; a zero would yield Infinity here — confirm against TermCollection's contract.
    for (int i = 0; i < terms.size(); i++) {
      String term = terms.get(i);
      int count = documents.findTermFrequency(term);
      float weight = (float) Math.log((double) documents.size() / count);
      terms.setWeithg(term, weight); // (sic) misspelled setter in TermCollection's API
    }

    for (int i = 0; i < documents.size(); i++) {
      String document = documents.get(i);
      String[] words = document.split("\\s+");

      System.out.println(document);

      for (String word : words) {
        TermItem item = new TermItem();
        float tfidf = VectorSpaceModel.findTFIDF(document, word);

        item.setTerm(word);
        item.setTfidf(tfidf);

        // Record every occurrence position of the word within the document.
        // Primitive int avoids re-boxing an Integer on each loop iteration.
        int pos = -1;
        while ((pos = document.indexOf(word, pos + 1)) >= 0) {
          item.addPos(pos);
        }

        if (tfidf > 0.0) {
          System.out.println(item.toString());
        }
      }
    }
  }