/**
 * Reads {@code fileName} line by line (decoded with {@code encoding}) and builds an instance list
 * from the lines that look like {@code id,label,text}.
 *
 * <p>For every line matching {@code ^(.*?),(.*?),(.*)$}, the first group is appended to
 * {@code docIds} and the third group is pushed through the token/feature pipeline as a new
 * instance. Lines that do not match are skipped silently. Note that {@code docIds} is only ever
 * appended to here; it is not cleared first.
 *
 * @return the instances created from the matching lines, in file order
 * @throws IOException if the file cannot be opened or closed
 */
private InstanceList readFile() throws IOException {
  ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
  // Tokens are runs of two or more Unicode letters.
  pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+")));
  pipeList.add(new TokenSequence2FeatureSequence());
  InstanceList testing = new InstanceList(new SerialPipes(pipeList));

  // Compiled once up front instead of once per input line.
  Pattern linePattern = Pattern.compile("^(.*?),(.*?),(.*)$");

  // The stream is opened separately so it is closed even if constructing the Scanner fails
  // (for example because of an unsupported encoding name).
  FileInputStream input = new FileInputStream(fileName);
  try {
    Scanner scanner = new Scanner(input, encoding);
    while (scanner.hasNextLine()) {
      // Drop any stray carriage returns left inside the line.
      String text = scanner.nextLine().replace("\r", "");
      Matcher matcher = linePattern.matcher(text);
      if (matcher.find()) {
        docIds.add(matcher.group(1));
        testing.addThruPipe(new Instance(matcher.group(3), null, "test instance", null));
      }
    }
  } finally {
    input.close();
  }
  return testing;
}
private InstanceList generateInstanceList() throws Exception { ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+"))); pipeList.add(new TokenSequence2FeatureSequence()); Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); InstanceList instances = new InstanceList(new SerialPipes(pipeList)); instances.addThruPipe( new CsvIterator( fileReader, Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"), 3, 2, 1)); // data, label, name fields return instances; }
public void doInference() { try { ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile)); TopicInferencer inferencer = model.getInferencer(); // TopicInferencer inferencer = // TopicInferencer.read(new File(inferencerFile)); // InstanceList testing = readFile(); readFile(); InstanceList testing = generateInstanceList(); // readFile(); for (int i = 0; i < testing.size(); i++) { StringBuilder probabilities = new StringBuilder(); double[] testProbabilities = inferencer.getSampledDistribution(testing.get(i), 10, 1, 5); ArrayList probabilityList = new ArrayList(); for (int j = 0; j < testProbabilities.length; j++) { probabilityList.add(new Pair<Integer, Double>(j, testProbabilities[j])); } Collections.sort(probabilityList, new CustomComparator()); for (int j = 0; j < testProbabilities.length && j < topN; j++) { if (j > 0) probabilities.append(" "); probabilities.append( ((Pair<Integer, Double>) probabilityList.get(j)).getFirst().toString() + "," + ((Pair<Integer, Double>) probabilityList.get(j)).getSecond().toString()); } System.out.println(docIds.get(i) + "," + probabilities.toString()); } } catch (Exception e) { e.printStackTrace(); System.err.println(e.getMessage()); } }
public void test() throws Exception { ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile)); TopicInferencer inferencer = model.getInferencer(); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+"))); pipeList.add(new TokenSequence2FeatureSequence()); InstanceList instances = new InstanceList(new SerialPipes(pipeList)); Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); instances.addThruPipe( new CsvIterator( fileReader, Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"), 3, 2, 1)); // data, label, name fields double[] testProbabilities = inferencer.getSampledDistribution(instances.get(1), 10, 1, 5); for (int i = 0; i < 1000; i++) System.out.println(i + ": " + testProbabilities[i]); }
/**
 * Builds a feature-extraction pipeline and runs the gzip-compressed training file through it,
 * with a {@code SequencePrintingPipe} attached to the file {@code test.out} as the last stage.
 *
 * <p>Instances are groups of lines separated by blank lines (regex {@code ^\s*$}). The resulting
 * instance list is not kept; the constructor is run for the output the pipeline produces.
 *
 * <p>NOTE(review): the training file is decoded with the platform default charset — confirm
 * whether a fixed charset such as UTF-8 is expected.
 *
 * @param trainingFilename path to the gzip-compressed training data
 * @throws IOException if {@code test.out} cannot be created or the training file cannot be read
 */
public TestCRFPipe(String trainingFilename) throws IOException {
  PrintWriter out = new PrintWriter("test.out");
  try {
    // Offsets, relative to the current token, whose features are conjoined.
    int[][] conjunctions = new int[3][];
    conjunctions[0] = new int[] {-1};
    conjunctions[1] = new int[] {1};
    conjunctions[2] = new int[] {-2, -1};

    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new OffsetConjunctions(conjunctions));
    pipes.add(new TokenTextCharSuffix("C1=", 1));
    pipes.add(new TokenTextCharSuffix("C2=", 2));
    pipes.add(new TokenTextCharSuffix("C3=", 3));
    pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
    pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
    pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
    pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
    pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
    pipes.add(new TokenSequence2FeatureVectorSequence());
    pipes.add(new SequencePrintingPipe(out));

    Pipe pipe = new SerialPipes(pipes);
    InstanceList trainingInstances = new InstanceList(pipe);

    // Opened on its own so the file handle is released even if the gzip header is invalid.
    FileInputStream rawInput = new FileInputStream(trainingFilename);
    try {
      BufferedReader reader =
          new BufferedReader(new InputStreamReader(new GZIPInputStream(rawInput)));
      try {
        trainingInstances.addThruPipe(
            new LineGroupIterator(reader, Pattern.compile("^\\s*$"), true));
      } finally {
        reader.close();
      }
    } finally {
      rawInput.close();
    }
  } finally {
    // Always flush and close test.out, including when the pipeline throws.
    out.close();
  }
}
/**
 * Trains a topic model on a serialized instance list and optionally writes its diagnostics as
 * XML.
 *
 * <p>Arguments:
 *
 * <ol>
 *   <li>{@code args[0]} — path of the serialized instance list to load
 *   <li>{@code args[1]} — number of topics (parsed as an integer)
 *   <li>{@code args[2]} — optional; when exactly three arguments are given, the diagnostics XML
 *       is written to this path
 * </ol>
 *
 * @param args command-line arguments as described above
 * @throws IllegalArgumentException if fewer than two arguments are supplied
 * @throws Exception if loading, training or writing fails
 */
public static void main(String[] args) throws Exception {
  if (args.length < 2) {
    // Fail with a readable message instead of an ArrayIndexOutOfBoundsException.
    throw new IllegalArgumentException(
        "Usage: <instance-list-file> <num-topics> [diagnostics-xml-output]");
  }

  InstanceList instances = InstanceList.load(new File(args[0]));
  int numTopics = Integer.parseInt(args[1]);

  ParallelTopicModel model = new ParallelTopicModel(numTopics, 5.0, 0.01);
  model.addInstances(instances);
  model.setNumIterations(1000);
  model.estimate();

  TopicModelDiagnostics diagnostics = new TopicModelDiagnostics(model, 20);

  if (args.length == 3) {
    PrintWriter out = new PrintWriter(args[2]);
    try {
      out.println(diagnostics.toXML());
    } finally {
      // Close the output file even if generating the XML fails.
      out.close();
    }
  }
}