예제 #1
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
 /**
  * Writes one line per document listing its most prominent topics.
  *
  * <p>Each line holds the document index, the instance source (or the literal
  * {@code null-source} when the source is null), and then up to {@code max}
  * "topic proportion" pairs in decreasing order of proportion. A proportion is
  * the document's count for that topic divided by the document length.
  * Listing for a document stops early once the best remaining proportion is
  * zero or falls below {@code threshold}.
  *
  * @param pw destination writer; it is neither flushed nor closed here
  * @param threshold minimum proportion a topic needs in order to be printed
  * @param max maximum number of topics to print per document; a negative
  *     value means "all topics"
  */
 public void printDocumentTopics(PrintWriter pw, double threshold, int max) {
   pw.println("#doc source topic proportion ...");
   // A negative max means no limit; resolve it once rather than per document.
   final int limit = (max < 0) ? numTopics : max;
   // One slot per topic. This buffer must be sized by the number of topics,
   // not the number of documents, or it overflows when documents < topics.
   final double[] topicDist = new double[numTopics];
   for (int di = 0; di < topics.length; di++) {
     pw.print(di);
     pw.print(' ');
     final Object source = ilist.get(di).getSource();
     if (source != null) {
       pw.print(source.toString());
     } else {
       pw.print("null-source");
     }
     pw.print(' ');
     final int docLen = topics[di].length;
     // Single-precision division is kept on purpose so the printed digits
     // stay identical to what this method has always emitted.
     for (int ti = 0; ti < numTopics; ti++) {
       topicDist[ti] = (((float) docTopicCounts[di][ti]) / docLen);
     }
     // Repeatedly select the largest remaining proportion, print it, and zero
     // it out so the next pass finds the runner-up.
     for (int tp = 0; tp < limit; tp++) {
       double maxvalue = 0;
       int maxindex = -1;
       for (int ti = 0; ti < numTopics; ti++) {
         if (topicDist[ti] > maxvalue) {
           maxvalue = topicDist[ti];
           maxindex = ti;
         }
       }
       // maxindex stays -1 when every remaining proportion is zero (or NaN).
       if (maxindex == -1 || topicDist[maxindex] < threshold) {
         break;
       }
       pw.print(maxindex + " " + topicDist[maxindex] + " ");
       topicDist[maxindex] = 0;
     }
     pw.println(' ');
   }
 }
  /**
   * Trains a topic model on a serialized instance list and optionally writes
   * its diagnostics as XML.
   *
   * <p>Arguments: {@code args[0]} is the instance-list file to load,
   * {@code args[1]} is the number of topics, and {@code args[2]} is the output
   * file for the diagnostics XML. The XML is written only when exactly three
   * arguments are supplied.
   *
   * @param args command-line arguments as described above
   * @throws IllegalArgumentException if fewer than two arguments are given
   * @throws Exception if loading, training, or writing fails
   */
  public static void main(String[] args) throws Exception {
    if (args.length < 2) {
      throw new IllegalArgumentException(
          "Usage: <instance-list-file> <num-topics> [diagnostics-xml-output]");
    }

    InstanceList instances = InstanceList.load(new File(args[0]));
    int numTopics = Integer.parseInt(args[1]);
    // NOTE(review): 5.0 and 0.01 are presumably the alpha-sum and beta
    // hyperparameters; confirm against the ParallelTopicModel constructor.
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 5.0, 0.01);
    model.addInstances(instances);
    model.setNumIterations(1000);

    model.estimate();

    TopicModelDiagnostics diagnostics = new TopicModelDiagnostics(model, 20);

    if (args.length == 3) {
      PrintWriter out = new PrintWriter(args[2]);
      try {
        out.println(diagnostics.toXML());
      } finally {
        // Release the file handle even if building or writing the XML fails.
        out.close();
      }
    }
  }
예제 #3
0
  /**
   * Builds a token-feature pipeline and pushes a gzip-compressed training file
   * through it.
   *
   * <p>The last pipe in the chain is a {@code SequencePrintingPipe} constructed
   * with a writer on the file {@code test.out} in the working directory. The
   * resulting instance list is local to this constructor and is not retained.
   *
   * @param trainingFilename path of the gzip-compressed training data
   * @throws IOException if the training file or {@code test.out} cannot be
   *     opened, or if reading or closing the input fails
   */
  public TestCRFPipe(String trainingFilename) throws IOException {

    PrintWriter out = new PrintWriter("test.out");
    FileInputStream fileIn = null;
    BufferedReader reader = null;

    try {
      ArrayList<Pipe> pipes = new ArrayList<Pipe>();

      // NOTE(review): presumably relative token offsets to conjoin features
      // over; confirm against OffsetConjunctions.
      int[][] conjunctions = new int[3][];
      conjunctions[0] = new int[] {-1};
      conjunctions[1] = new int[] {1};
      conjunctions[2] = new int[] {-2, -1};

      pipes.add(new SimpleTaggerSentence2TokenSequence());
      // pipes.add(new FeaturesInWindow("PREV-", -1, 1));
      // pipes.add(new FeaturesInWindow("NEXT-", 1, 2));
      pipes.add(new OffsetConjunctions(conjunctions));
      pipes.add(new TokenTextCharSuffix("C1=", 1));
      pipes.add(new TokenTextCharSuffix("C2=", 2));
      pipes.add(new TokenTextCharSuffix("C3=", 3));
      pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
      pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
      pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
      pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
      pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
      pipes.add(new TokenSequence2FeatureVectorSequence());
      pipes.add(new SequencePrintingPipe(out));

      Pipe pipe = new SerialPipes(pipes);

      InstanceList trainingInstances = new InstanceList(pipe);

      // The raw file stream is opened separately so it can still be released
      // if wrapping it (for example, reading the gzip header) fails.
      fileIn = new FileInputStream(trainingFilename);
      reader = new BufferedReader(new InputStreamReader(new GZIPInputStream(fileIn)));

      // NOTE(review): the pattern matches blank lines; presumably they
      // delimit line groups. Confirm the meaning of the trailing `true` flag
      // against LineGroupIterator.
      trainingInstances.addThruPipe(
          new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
    } finally {
      // PrintWriter.close() does not throw, so it goes first and cannot be
      // skipped by a failure while closing the input.
      out.close();
      if (reader != null) {
        reader.close(); // closes the whole gzip/file stream chain
      } else if (fileIn != null) {
        fileIn.close();
      }
    }
  }
예제 #4
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
 /**
  * Dumps the per-token topic assignments, one token per line.
  *
  * <p>After a header line, each line holds the document index, the position
  * within the document, the type index at that position, the object the data
  * alphabet maps that index to, and the entry of {@code topics} for that
  * token, separated by single spaces.
  *
  * @param pw destination writer; it is neither flushed nor closed here
  */
 public void printState(PrintWriter pw) {
   final Alphabet alphabet = ilist.getDataAlphabet();
   pw.println("#doc pos typeindex type topic");
   for (int doc = 0; doc < topics.length; doc++) {
     final FeatureSequence tokens = (FeatureSequence) ilist.get(doc).getData();
     for (int pos = 0; pos < topics[doc].length; pos++) {
       final int typeIndex = tokens.getIndexAtPosition(pos);
       // The three integer columns go out together; the alphabet lookup
       // still happens only after they have been written.
       pw.print(doc + " " + pos + " " + typeIndex + " ");
       pw.print(alphabet.lookupObject(typeIndex));
       pw.print(' ');
       pw.println(topics[doc][pos]);
     }
   }
 }
예제 #5
0
파일: LDA.java 프로젝트: shamsa-abid/habiba
 /**
  * Writes the state dump produced by {@code printState(PrintWriter)} to a file.
  *
  * @param f file to create or overwrite
  * @throws IOException if the file cannot be opened for writing
  */
 public void printState(File f) throws IOException {
   PrintWriter writer = new PrintWriter(new FileWriter(f));
   try {
     printState(writer);
   } finally {
     // Close in finally so the file handle is released even if the dump
     // throws part-way through.
     writer.close();
   }
 }
예제 #6
0
 /**
  * Dumps the per-token state, one token per line.
  *
  * <p>After a header line, each line holds the document index, the position
  * within the document, the type index at that position, the object
  * {@code uniAlphabet} maps that index to, a 0/1 flag that is 0 exactly when
  * the bigram index at that position is -1, and the entries of {@code topics}
  * and {@code grams} for that token, separated by single spaces.
  *
  * @param pw destination writer; it is neither flushed nor closed here
  */
 public void printState(PrintWriter pw) {
   pw.println("#doc pos typeindex type bigrampossible? topic bigram");
   for (int doc = 0; doc < topics.length; doc++) {
     final FeatureSequenceWithBigrams tokens =
         (FeatureSequenceWithBigrams) ilist.get(doc).getData();
     for (int pos = 0; pos < topics[doc].length; pos++) {
       final int typeIndex = tokens.getIndexAtPosition(pos);
       // The three integer columns go out together; the lookups below still
       // run in their original order, after these have been written.
       pw.print(doc + " " + pos + " " + typeIndex + " ");
       pw.print(uniAlphabet.lookupObject(typeIndex));
       pw.print(' ');
       int bigramPossible = 1;
       if (tokens.getBiIndexAtPosition(pos) == -1) {
         bigramPossible = 0;
       }
       pw.print(bigramPossible);
       pw.print(' ');
       pw.print(topics[doc][pos]);
       pw.print(' ');
       pw.println(grams[doc][pos]);
     }
   }
 }