public static void main(String[] args) { List<Revision> revisions = new ArrayList<Revision>(); List<TestDocument> documents = new ArrayList<TestDocument>(); List<TestDocument> allDocuments = new ArrayList<TestDocument>(); // read in the train and test data System.out.println("reading in train data!"); XMLParser parser = new XMLParser(); File dir = new File(TRAIN_DIR + USER); for (File file : dir.listFiles()) { // System.err.println(file); Revision revision = parser.parseRevision(file); if (revision != null) revisions.add(revision); } System.out.println("reading in test data!"); List<String> documentsToUse = null; try { documentsToUse = QueryGetter.getNMostRecentlyEditedPageIds(USER, 50); } catch (Exception e) { e.printStackTrace(); } dir = new File(TEST_DIR); for (File file : dir.listFiles()) { String name = file.getName(); int dash = name.indexOf("-"); int dot = name.indexOf("."); String id = name.substring(0, dash); if (!name.substring(dash + 1, dot).equals(USER) || documentsToUse.indexOf(id) == -1) continue; TestDocument document = parser.parseDocument(file); if (document.paragraphs.size() == 0) continue; // if (document.paragraphs.size() < 10) continue; document.id = id; // if (documentsToUse.indexOf(id) < 25 && documents.size() < 10) if (documentsToUse.indexOf(id) < 10) // if (documents.size() < 10) documents.add(document); allDocuments.add(document); } // prepare test data System.out.println("calculating scores!"); for (TestDocument document : allDocuments) calculateScores(document, revisions); // train + test using leave-one-out cross-validation System.out.println("testing!"); MultinomialNaiveBayes classifier = new MultinomialNaiveBayes(); for (TestDocument testDocument : documents) { List<Revision> trainRevisions = new ArrayList<Revision>(revisions); for (Revision revision : revisions) { if (revision.pageId.equals(testDocument.id)) trainRevisions.remove(revision); } List<TestDocument> trainDocuments = new ArrayList<TestDocument>(); for (TestDocument document : allDocuments) { if (document.equals(testDocument)) continue; TestDocument newDocument = new TestDocument(); for (int i = 0; i < document.paragraphs.size(); i++) { if (document.scores.get(i) == 0) { newDocument.paragraphs.add(document.paragraphs.get(i)); newDocument.scores.add(document.scores.get(i)); } } trainDocuments.add(newDocument); } // classifier.countDocumentWords = true; classifier.calculateProbabilities(trainRevisions, trainDocuments); List<MultinomialNaiveBayes.Pair> predictions = classifier.mostLikelyParagraphs(testDocument); Set<String> revisedParagraphs = new HashSet<String>(); for (int i = 0; i < testDocument.paragraphs.size(); i++) { if (testDocument.scores.get(i) > 0) revisedParagraphs.add(testDocument.paragraphs.get(i)); } System.out.println("size: " + revisedParagraphs.size()); System.out.println("num paragraphs: " + predictions.size()); int rank = -1; boolean done = false; for (int i = 0; i < predictions.size(); i++) { // System.out.println("~~~PREDICTION" + i + "~~~\n"); // System.out.println(predictions.get(i).paragraph); // System.out.println("score: " + predictions.get(i).score); if (revisedParagraphs.contains(predictions.get(i).paragraph)) { rank = i; done = true; } if (done) break; } System.out.println("rank: " + rank); // System.out.println("PREDICTED:\n" + predictions.get(0).paragraph); // System.out.println("ACTUAL:\n" + actual); } }