/**
 *  train the tagger using the DocumentCollection in file 'trainingCollection'.
 *  'trainingCollection' should consist of documents which have been explicitly
 *  tagged with part-of-speech information.
 */

void train(String trainingCollection) {
    for (int i = 0; i < posTable.length; i++)
        tagTable[i] = new String[] {"constit", "cat", posTable[i], posTable[i]};
    // build ergodic HMM with one state for each POS (plus start and end states)
    HMMstate startState = new HMMstate("start", "", WordFeatureHMMemitter.class);
    posh.addState(startState);
    for (int j = 0; j < posTable.length; j++)
        startState.addArc(new HMMarc(posTable[j], 0));
    HMMstate endState = new HMMstate("end", "", WordFeatureHMMemitter.class);
    posh.addState(endState);
    for (int i = 0; i < posTable.length; i++) {
        String pos = posTable[i];
        HMMstate state = new HMMstate(pos, pos, WordFeatureHMMemitter.class);
        posh.addState(state);
        for (int j = 0; j < posTable.length; j++)
            state.addArc(new HMMarc(posTable[j], 0));
        state.addArc(new HMMarc("end", 0));
    }
    posh.resolveNames();
    posh.resetForTraining();
    annotator = new HMMannotator(posh);
    annotator.setTagTable(tagTable);
    annotator.setBItag(false);

    DocumentCollection col = new DocumentCollection(trainingCollection);
    col.open();
    for (int i = 0; i < col.size(); i++) {
        ExternalDocument doc = col.get(i);
        doc.open();
        System.out.println("Training from " + doc.fileName());
        // divide at endmarks (constit cat="."), adding "S" marks
        int posn = 0;
        int start = posn;
        Vector anns;
        while ((anns = doc.annotationsAt(posn, "constit")) != null) {
            Annotation ann = (Annotation) anns.get(0);
            posn = ann.span().end();
            String pos = (String) ann.get("cat");
            if (pos.equals(".")) {
                doc.annotate("S", new Span(start, posn), new FeatureSet());
                start = posn;
            }
        }
        annotator.train(doc);
        // free up space taken by annotations on document
        doc.clearAnnotations();
    }
    posh.computeProbabilities();
}
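// A minimal, self-contained sketch (hypothetical, not part of the tagger above) of
// the ergodic topology that train() builds: "start" carries an arc to every POS
// state, and each POS state carries an arc to every POS state plus one to "end".
// For N tags this yields N + 2 states and N*(N+2) arcs in total. Class and method
// names below are illustrative only; no JET classes are used.
import java.util.*;

public class ErgodicTopologySketch {

    // Map each state name to the list of states it can transition to.
    static Map<String, List<String>> buildTopology(String[] posTable) {
        Map<String, List<String>> arcs = new LinkedHashMap<>();
        // the start state may transition to any POS state
        arcs.put("start", new ArrayList<>(Arrays.asList(posTable)));
        for (String pos : posTable) {
            // each POS state may transition to any POS state, or end the sentence
            List<String> targets = new ArrayList<>(Arrays.asList(posTable));
            targets.add("end");
            arcs.put(pos, targets);
        }
        arcs.put("end", Collections.emptyList()); // absorbing final state
        return arcs;
    }

    public static void main(String[] args) {
        buildTopology(new String[] {"NN", "VB", "JJ"})
            .forEach((state, targets) -> System.out.println(state + " -> " + targets));
    }
}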
public void run() {
    List<TestFailure> failures = dto.loadFailures();
    DocumentCollection documents = DocumentCollection.getInstance();
    TermCollection terms = TermCollection.getInstance();
    TestFailureUtil.buildDocumentCollection(failures, documents);
    TestFailureUtil.buildTermCollection(failures, terms);

    for (int i = 0; i < terms.size(); i++) {
        String term = terms.get(i);
        Integer count = documents.findTermFrequency(term);
        // compute the IDF weight of the term: log(N / df)
        Float weight = (float) Math.log((double) documents.size() / (double) count);
        terms.setWeithg(term, weight);
    }

    for (int i = 0; i < documents.size(); i++) {
        String document = documents.get(i);
        String[] words = document.split("\\s+");
        System.out.println(document);
        for (String word : words) {
            TermItem item = new TermItem();
            float tfidf = VectorSpaceModel.findTFIDF(document, word);
            item.setTerm(word);
            item.setTfidf(tfidf);
            // record every character offset at which the word occurs
            int pos = -1;
            while ((pos = document.indexOf(word, pos + 1)) >= 0) {
                item.addPos(pos);
            }
            if (tfidf > 0.0)
                System.out.println(item.toString());
        }
    }
}
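// A self-contained sketch (hypothetical) of the weighting scheme run() relies on.
// It mirrors the IDF computed above, idf(t) = log(N / df(t)), and assumes that
// VectorSpaceModel.findTFIDF returns something equivalent to tf(t, d) * idf(t),
// with tokens split on whitespace as in run(). All names here are illustrative.
import java.util.*;

public class TfIdfSketch {

    static float tfidf(String term, String document, List<String> corpus) {
        // term frequency: occurrences of the term among the document's tokens
        int tf = 0;
        for (String token : document.split("\\s+"))
            if (token.equals(term)) tf++;
        // document frequency: number of corpus documents containing the term
        int df = 0;
        for (String doc : corpus)
            if (Arrays.asList(doc.split("\\s+")).contains(term)) df++;
        if (tf == 0 || df == 0) return 0f;
        return tf * (float) Math.log((double) corpus.size() / (double) df);
    }

    public static void main(String[] args) {
        List<String> corpus = Arrays.asList(
            "timeout in login test",
            "timeout in payment test",
            "assertion failure in login test");
        // "timeout" occurs in 2 of 3 documents: weight is log(3/2) > 0
        System.out.println(tfidf("timeout", corpus.get(0), corpus));
        // "test" occurs in every document: idf = log(1) = 0, so the weight is 0
        System.out.println(tfidf("test", corpus.get(0), corpus));
    }
}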