/**
 * Reads the input file line by line, expecting "docId,label,text" records.
 * Records each docId in {@code docIds} and pipes the text field into a new
 * MALLET InstanceList (tokenize + feature-sequence pipes).
 *
 * @return the instance list built from the file's text fields
 * @throws IOException if the file cannot be opened
 */
private InstanceList readFile() throws IOException {
  Scanner scanner = new Scanner(new FileInputStream(fileName), encoding);
  ArrayList<Pipe> pipeList = new ArrayList<Pipe>();
  pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+")));
  pipeList.add(new TokenSequence2FeatureSequence());
  InstanceList testing = new InstanceList(new SerialPipes(pipeList));
  // Compile the line pattern once; the original recompiled it for every line.
  Pattern linePattern = Pattern.compile("^(.*?),(.*?),(.*)$");
  try {
    while (scanner.hasNextLine()) {
      String text = scanner.nextLine();
      // Strip stray carriage returns (Windows line endings).
      text = text.replaceAll("\\x0d", "");
      Matcher matcher = linePattern.matcher(text);
      if (matcher.find()) {
        docIds.add(matcher.group(1));
        testing.addThruPipe(new Instance(matcher.group(3), null, "test instance", null));
      }
    }
  } finally {
    scanner.close();
  }
  return testing;
}
@Override
public void process(JCas aJCas) throws AnalysisEngineProcessException {
  // Convert every token sequence extracted from the CAS into a MALLET
  // instance (unlabeled) and feed it through the shared instance list's pipe.
  DocumentMetaData meta = DocumentMetaData.get(aJCas);
  try {
    for (TokenSequence sequence : generateTokenSequences(aJCas)) {
      Instance unlabeled =
          new Instance(sequence, NONE_LABEL, meta.getDocumentId(), meta.getDocumentUri());
      instanceList.addThruPipe(unlabeled);
    }
  } catch (FeaturePathException e) {
    // Wrap so the UIMA framework sees a processing failure with the cause attached.
    throw new AnalysisEngineProcessException(e);
  }
}
public void addDocument(String key, String body, String title) { // this.docContent.add(body); list.addThruPipe(new Instance(body, key, title, null)); // Document doc = new Document(this.ts,-1,title,body); // this.documents.add(doc); this.documents.add(body); // return doc; }
public InstanceList processDocs(Collection<ConnectionsDocument> docsToRun) { InstanceList instanceList = new InstanceList(new Noop()); int count = 0; for (ConnectionsDocument doc : docsToRun) { if (count > 11) { log.info("Stopping at " + (count - 1) + " documents"); break; } Instance instance = new Instance(doc, null, null, null); instanceList.addThruPipe(instance); // count++; } pipes.addFirst(new Sentence2TokenSequence(skipAbbrev, inputTokenSet)); pipes.addFirst(new DocumentToSentencesPipe(inputTokenSet)); SerialPipes pipeSerial = new SerialPipes(pipes); InstanceList instanceListx = new InstanceList(pipeSerial); instanceListx.addThruPipe(instanceList.iterator()); instanceList = instanceListx; return instanceList; }
public InstanceList readArray(String[] cleanTexts) { StringArrayIterator iterator = new StringArrayIterator(cleanTexts); // Construct a new instance list, passing it the pipe we want to use to // process instances. InstanceList instances = new InstanceList(pipe); int index = 0; for (Instance inst : instances) { inst.setName(name_id.get(index)); inst.setTarget("english"); index++; } // Now process each instance provided by the iterator. instances.addThruPipe(iterator); return instances; }
// in the training feature table // Lines should be formatted as: // // [name] [label] [data ... ] // public static Classifier TrainMaxent(String trainingFilename, File modelFile) throws IOException { // build data input pipe ArrayList<Pipe> pipes = new ArrayList<Pipe>(); // define pipe // the features in [data ...] should like: feature:value pipes.add(new Target2Label()); pipes.add(new Csv2FeatureVector()); Pipe pipe = new SerialPipes(pipes); pipe.setTargetProcessing(true); // read data InstanceList trainingInstances = new InstanceList(pipe); FileReader training_file_reader = new FileReader(trainingFilename); CsvIterator reader = new CsvIterator( training_file_reader, "(\\w+)\\s+([^\\s]+)\\s+(.*)", 3, 2, 1); // (data, label, name) field indices trainingInstances.addThruPipe(reader); training_file_reader.close(); // calculate running time long startTime = System.currentTimeMillis(); PrintStream temp = System.err; System.setErr(System.out); // train a Maxent classifier (could be other classifiers) ClassifierTrainer trainer = new MaxEntTrainer(Gaussian_Variance); Classifier classifier = trainer.train(trainingInstances); System.setErr(temp); // calculate running time long endTime = System.currentTimeMillis(); long totalTime = endTime - startTime; System.out.println("Total training time: " + totalTime); // write model ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(modelFile)); oos.writeObject(classifier); oos.close(); return classifier; }
private InstanceList generateInstanceList() throws Exception { ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+"))); pipeList.add(new TokenSequence2FeatureSequence()); Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); InstanceList instances = new InstanceList(new SerialPipes(pipeList)); instances.addThruPipe( new CsvIterator( fileReader, Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"), 3, 2, 1)); // data, label, name fields return instances; }
/** * Prepare Instances for use with LDA. * * @param r * @return */ public static InstanceList loadInstancesLDA(Reader r) { ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); // Pipes: lowercase, tokenize, remove stopwords, map to features pipeList.add(new Target2Label()); pipeList.add(new CharSequenceLowercase()); pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}[\\p{L}\\p{P}]+\\p{L}"))); pipeList.add( new TokenSequenceRemoveStopwords(stopWords, stopWordsEncoding, false, false, false)); pipeList.add(new TokenSequence2FeatureSequence()); SerialPipes pipes = new SerialPipes(pipeList); InstanceList instances = new InstanceList(pipes); // create instances with: 3: data; 2: label; 1: name fields instances.addThruPipe(new CsvIterator(r, Pattern.compile("(.*)\t(.*)\t(.*)"), 3, 2, 1)); return instances; }
public void test() throws Exception { ParallelTopicModel model = ParallelTopicModel.read(new File(inferencerFile)); TopicInferencer inferencer = model.getInferencer(); ArrayList<Pipe> pipeList = new ArrayList<Pipe>(); pipeList.add(new CharSequence2TokenSequence(Pattern.compile("\\p{L}\\p{L}+"))); pipeList.add(new TokenSequence2FeatureSequence()); InstanceList instances = new InstanceList(new SerialPipes(pipeList)); Reader fileReader = new InputStreamReader(new FileInputStream(new File(fileName)), "UTF-8"); instances.addThruPipe( new CsvIterator( fileReader, Pattern.compile("^(\\S*)[\\s,]*(\\S*)[\\s,]*(.*)$"), 3, 2, 1)); // data, label, name fields double[] testProbabilities = inferencer.getSampledDistribution(instances.get(1), 10, 1, 5); for (int i = 0; i < 1000; i++) System.out.println(i + ": " + testProbabilities[i]); }
/**
 * Builds a CRF feature pipe, runs the gzipped training file through it, and
 * prints the resulting sequences to {@code test.out}.
 *
 * @param trainingFilename gzipped file of blank-line-separated sentence groups
 * @throws IOException if the input cannot be read or the output cannot be written
 */
public TestCRFPipe(String trainingFilename) throws IOException {
  // try-with-resources: the original leaked the PrintWriter and the GZIP
  // stream on exception, and never closed the GZIP stream at all.
  try (PrintWriter out = new PrintWriter("test.out")) {
    ArrayList<Pipe> pipes = new ArrayList<Pipe>();
    // Conjunction offsets: previous token, next token, and the two before.
    int[][] conjunctions = new int[3][];
    conjunctions[0] = new int[] {-1};
    conjunctions[1] = new int[] {1};
    conjunctions[2] = new int[] {-2, -1};
    pipes.add(new SimpleTaggerSentence2TokenSequence());
    pipes.add(new OffsetConjunctions(conjunctions));
    // Character-suffix features of length 1-3.
    pipes.add(new TokenTextCharSuffix("C1=", 1));
    pipes.add(new TokenTextCharSuffix("C2=", 2));
    pipes.add(new TokenTextCharSuffix("C3=", 3));
    // Orthographic surface features.
    pipes.add(new RegexMatches("CAPITALIZED", Pattern.compile("^\\p{Lu}.*")));
    pipes.add(new RegexMatches("STARTSNUMBER", Pattern.compile("^[0-9].*")));
    pipes.add(new RegexMatches("HYPHENATED", Pattern.compile(".*\\-.*")));
    pipes.add(new RegexMatches("DOLLARSIGN", Pattern.compile("\\$.*")));
    pipes.add(new TokenFirstPosition("FIRSTTOKEN"));
    pipes.add(new TokenSequence2FeatureVectorSequence());
    pipes.add(new SequencePrintingPipe(out));
    Pipe pipe = new SerialPipes(pipes);
    InstanceList trainingInstances = new InstanceList(pipe);
    try (BufferedReader in =
        new BufferedReader(
            new InputStreamReader(new GZIPInputStream(new FileInputStream(trainingFilename))))) {
      // Instances are delimited by blank lines.
      trainingInstances.addThruPipe(
          new LineGroupIterator(in, Pattern.compile("^\\s*$"), true));
    }
  }
}
/**
 * Builds an Arg1 ranking instance for a PDTB discourse relation and pipes it
 * into the given instance list.
 *
 * <p>Locates the connective span in the Arg2 sentence, finds the syntactic
 * heads of both arguments, identifies which candidate position corresponds to
 * the gold Arg1 head, and — if found — emits an {@code Arg1RankInstance}.
 * Returns silently (with a console note) when a head or the true candidate
 * cannot be resolved.
 */
private void addInstancesThroughPipe(
    PDTBRelation relation,
    Document document,
    int arg1Line,
    int arg2Line,
    InstanceList instanceList) {
  // Locate the connective head leaves inside the Arg2 sentence tree.
  String connectiveGornAddress = relation.getConnectiveGornAddress();
  Tree arg2Tree = document.getTree(arg2Line);
  List<Tree> connHeadLeaves =
      connAnalyzer.getConnHeadLeaves(arg2Tree, connectiveGornAddress, relation.getConnHead());
  if (connHeadLeaves.isEmpty()) return;
  int connStart = treeAnalyzer.getLeafPosition(arg2Tree, connHeadLeaves.get(0));
  int connEnd =
      treeAnalyzer.getLeafPosition(arg2Tree, connHeadLeaves.get(connHeadLeaves.size() - 1));
  // Very wide connective spans (e.g. "if..else") are collapsed to their start.
  if ((connEnd - connStart) > 4) { // handle if..else, etc.
    connEnd = connStart;
  }
  // Consider only the first sentence in case of multi-line argument1.
  String arg1GornAddress = relation.getArg1GornAddress();
  Tree arg1Tree = document.getTree(arg1Line);
  List<Tree> arg1GornNodes = getArgGornNodes(arg1Tree, arg1Line, arg1GornAddress);
  Tree syntacticHead = headAnalyzer.getSyntacticHead(arg1Tree, arg1GornNodes);
  int arg1HeadPos = treeAnalyzer.getLeafPosition(arg1Tree, syntacticHead);
  String arg2GornAddress = relation.getArg2GornAddress();
  List<Tree> arg2GornNodes = getArgGornNodes(arg2Tree, arg2Line, arg2GornAddress);
  Tree arg2SyntacticHead = headAnalyzer.getSyntacticHead(arg2Tree, arg2GornNodes);
  int arg2HeadPos = treeAnalyzer.getLeafPosition(arg2Tree, arg2SyntacticHead);
  // -1 means the head leaf could not be located; skip such relations.
  if (arg2HeadPos == -1) {
    System.out.println("arg2Head == -1");
    return;
  }
  if (arg1HeadPos == -1) {
    System.out.println("arg1Head == -1");
    return;
  }
  // Find which candidate (line, head-position) pair matches the gold Arg1 head.
  int trueCandidate = -1;
  List<Pair<Integer, Integer>> candidates =
      getCandidates(document, arg2Line, connStart, connEnd, arg1Line);
  for (int i = 0; i < candidates.size(); i++) {
    Pair<Integer, Integer> candidate = candidates.get(i);
    if (candidate.first() == arg1Line && candidate.second() == arg1HeadPos) {
      trueCandidate = i;
      break;
    }
  }
  if (trueCandidate == -1) {
    // Gold head not among the candidates; report and drop the relation.
    System.out.println("true candidate == -1!!!");
    System.out.println(syntacticHead.value());
  } else {
    int extractArg2 =
        ARG2_EXTRACTOR.extractArg2(
            document.getSentence(arg2Line),
            document.getTree(arg2Line),
            document.getDepGraph(arg2Line),
            connStart,
            connEnd);
    // Fall back to position 0 when Arg2 extraction fails.
    if (extractArg2 == -1) {
      extractArg2 = 0;
      System.out.println("Arg2 == -1!!!!!!!!!!!!!!!!!");
    }
    // NOTE(review): arg2HeadPos (not extractArg2) is passed to the instance;
    // an earlier variant using extractArg2 is retained in history — confirm intended.
    Arg1RankInstance instance =
        new Arg1RankInstance(
            document, candidates, arg2Line, arg2HeadPos, connStart, connEnd, trueCandidate);
    instanceList.addThruPipe(instance);
  }
}
/** * @param targetTerm * @param sourceFile * @param termWindowSize * @param pipe */ private static InstanceList readConcordanceFileToInstanceList( String targetTerm, String sourceFile, int termWindowSize, Pipe pipe, boolean useCollocationalVector) { InstanceList instanceList = new InstanceList(pipe); BufferedReader in = null; try { in = new BufferedReader(new FileReader(sourceFile)); int incomplete = 0; String str; while ((str = in.readLine()) != null) { String[] lineArray = str.split(";"); if (lineArray.length != 4) { System.out.println( "WARNING: Skipping possibly invalid CSV line " + str + " in file " + sourceFile); continue; } String docID = lineArray[0].replace("Doc ID: ", "").trim(); String lineID = lineArray[1].replace("Line ID: ", "").trim(); String instanceID = (docID + "_" + lineID).replaceAll(" ", "_"); String senseID = lineArray[2].replace("DOE sense ID: ", "").trim(); String text = lineArray[3]; if (targetTerm.equals("faeder")) targetTerm = "fæder"; ArrayList<String> data = corpus.getWindowTokens(targetTerm, docID, lineID, termWindowSize); if (data.size() != 2 * termWindowSize) { incomplete++; System.out.println("WARNING: Incomplete token list " + incomplete + " found " + data); } if (useCollocationalVector) { System.out.println("Converting data to collocational vector: \n\t" + data); int i = termWindowSize * (-1); int index = i + termWindowSize; while (i <= termWindowSize && index < data.size()) { if (i != 0) { data.set(index, data.get(index) + "_" + i); // skip position of target term index++; } i++; } System.out.println("Converting data to collocational vector...DONE\n\t" + data); } String dataStr = data.toString().replace(", ", " ").replace("[", "").replace("]", "").replace(".", ""); Instance trainingInstance = new Instance(dataStr, senseID, instanceID, text); instanceList.addThruPipe(trainingInstance); } in.close(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } finally { if (in != null) 
try { in.close(); } catch (IOException e1) { } } return instanceList; }
/**
 * Builds MALLET feature data for the given sentences according to the feature
 * configuration: default surface-pattern regexes, optional bio-domain regexes,
 * configurable prefixes/suffixes, lexicon memberships, offset conjunctions,
 * and token/character n-grams, ending in a feature-vector-sequence pipe.
 *
 * @param sentences sentences to convert into instances
 * @param featureConfig properties controlling which feature pipes are added
 * @return the instance list produced by running the sentences through the pipe
 */
public InstanceList createFeatureData(
    final ArrayList<Sentence> sentences, final Properties featureConfig) {
  final FeatureConfiguration fc = new FeatureConfiguration();
  final ArrayList<Pipe> pipeParam = new ArrayList<Pipe>();
  // base pipe
  pipeParam.add(new BasePipe(featureConfig));
  // default surface patterns
  pipeParam.add(
      new RegexMatches(
          "INITLOWCAPS_ANYTHING_NONUMBER",
          Pattern.compile("[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "][^0-9]*")));
  pipeParam.add(
      new RegexMatches(
          "INITLOWCAPS_ANYTHING_WITHNUMBER",
          Pattern.compile("[" + UNICODE_LOWER + "][" + UNICODE_UPPER + "].*[0-9].*")));
  pipeParam.add(new RegexMatches("INITCAPS", Pattern.compile("[" + UNICODE_UPPER + "].*")));
  pipeParam.add(
      new RegexMatches(
          "INITCAPSALPHA",
          Pattern.compile("[" + UNICODE_UPPER + "][" + UNICODE_LOWER + "].*")));
  pipeParam.add(new RegexMatches("ALLCAPS", Pattern.compile("[" + UNICODE_UPPER + "]+")));
  pipeParam.add(
      new RegexMatches("CAPSMIX", Pattern.compile("[" + UNICODE_UPPER + UNICODE_LOWER + "]+")));
  // Numeric surface patterns.
  pipeParam.add(new RegexMatches("HASDIGIT", Pattern.compile(".*[0-9].*")));
  pipeParam.add(new RegexMatches("SINGLEDIGIT", Pattern.compile("[0-9]")));
  pipeParam.add(new RegexMatches("DOUBLEDIGIT", Pattern.compile("[0-9][0-9]")));
  pipeParam.add(new RegexMatches("NATURALNUMBER", Pattern.compile("[0-9]+")));
  pipeParam.add(new RegexMatches("REALNUMBER", Pattern.compile("[-0-9]+[.,]+[0-9.,]+")));
  // Dash and punctuation patterns.
  pipeParam.add(new RegexMatches("HASDASH", Pattern.compile(".*-.*")));
  pipeParam.add(new RegexMatches("INITDASH", Pattern.compile("-.*")));
  pipeParam.add(new RegexMatches("ENDDASH", Pattern.compile(".*-")));
  // NOTE(review): two different patterns share the feature name "ALPHANUMERIC"
  // (letters-then-digits and digits-then-letters) — confirm this is intended.
  pipeParam.add(
      new RegexMatches(
          "ALPHANUMERIC",
          Pattern.compile(".*[" + UNICODE_UPPER + UNICODE_LOWER + "].*[0-9].*")));
  pipeParam.add(
      new RegexMatches(
          "ALPHANUMERIC",
          Pattern.compile(".*[0-9].*[" + UNICODE_UPPER + UNICODE_LOWER + "].*")));
  pipeParam.add(new RegexMatches("IS_PUNCTUATION_MARK", Pattern.compile("[,.;:?!]")));
  pipeParam.add(new RegexMatches("IS_MINUSDASHSLASH", Pattern.compile("[-_/]")));
  // bio surface patterns
  if (fc.featureActive(featureConfig, "feat_bioregexp_enabled")) {
    pipeParam.add(new RegexMatches("ROMAN", Pattern.compile("[IVXDLCM]+")));
    pipeParam.add(new RegexMatches("HASROMAN", Pattern.compile(".*\\b[IVXDLCM]+\\b.*")));
    pipeParam.add(new RegexMatches("GREEK", Pattern.compile(GREEK)));
    pipeParam.add(new RegexMatches("HASGREEK", Pattern.compile(".*\\b" + GREEK + "\\b.*")));
  }
  // prefix and suffix (sizes taken from configuration, may be absent)
  final int[] prefixSizes = fc.getIntArray(featureConfig, "prefix_sizes");
  if (prefixSizes != null)
    for (final int prefixSize : prefixSizes)
      pipeParam.add(new TokenTextCharPrefix("PREFIX=", prefixSize));
  final int[] suffixSizes = fc.getIntArray(featureConfig, "suffix_sizes");
  if (suffixSizes != null)
    for (final int suffixSize : suffixSizes)
      pipeParam.add(new TokenTextCharSuffix("SUFFIX=", suffixSize));
  // lexicon membership: one membership feature per configured lexicon file
  for (final String key : fc.getLexiconKeys(featureConfig)) {
    final File lexFile = new File(featureConfig.getProperty(key));
    try {
      pipeParam.add(new LexiconMembership(key + "_membership", lexFile, true));
    } catch (final FileNotFoundException e) {
      // Missing lexicon files are reported but do not abort feature building.
      e.printStackTrace();
    }
  }
  // offset conjunction
  final int[][] offset = fc.offsetConjFromConfig(featureConfig.getProperty("offset_conjunctions"));
  if (offset != null) pipeParam.add(new OffsetConjunctions(offset));
  // token ngrams
  final int[] tokenNGrams = fc.getIntArray(featureConfig, "token_ngrams");
  if (tokenNGrams != null) pipeParam.add(new TokenNGramPipe(tokenNGrams));
  // character ngrams
  final int[] charNGrams = fc.getIntArray(featureConfig, "char_ngrams");
  if (charNGrams != null) pipeParam.add(new TokenTextCharNGrams("CHAR_NGRAM=", charNGrams));
  // un-comment this for printing out the generated features
  // pipeParam.add(new PrintTokenSequenceFeatures());
  pipeParam.add(new TokenSequence2FeatureVectorSequence(true, true));
  final Pipe[] pipeParamArray = new Pipe[pipeParam.size()];
  pipeParam.toArray(pipeParamArray);
  final Pipe myPipe = new SerialPipes(pipeParamArray);
  // TODO; removed for mallet-2 as not needed
  // myPipe.setTargetAlphabet(dict);
  // now run data through pipes
  final InstanceList data = new InstanceList(myPipe);
  final SentencePipeIterator iterator = new SentencePipeIterator(sentences);
  data.addThruPipe(iterator);
  return data;
}
/**
 * Command-line wrapper to train, test, or run a generic CRF-based tagger.
 *
 * @param args the command line arguments. Options (shell and Java quoting should be added as
 *     needed):
 *     <dl>
 *       <dt><code>--help</code> <em>boolean</em>
 *       <dd>Print this command line option usage information. Give <code>true</code> for longer
 *           documentation. Default is <code>false</code>.
 *       <dt><code>--prefix-code</code> <em>Java-code</em>
 *       <dd>Java code you want run before any other interpreted code. Note that the text is
 *           interpreted without modification, so unlike some other Java code options, you need to
 *           include any necessary 'new's. Default is null.
 *       <dt><code>--gaussian-variance</code> <em>positive-number</em>
 *       <dd>The Gaussian prior variance used for training. Default is 10.0.
 *       <dt><code>--train</code> <em>boolean</em>
 *       <dd>Whether to train. Default is <code>false</code>.
 *       <dt><code>--iterations</code> <em>positive-integer</em>
 *       <dd>Number of training iterations. Default is 500.
 *       <dt><code>--test</code> <code>lab</code> or <code>seg=</code><em>start-1</em><code>.
 *           </code><em>continue-1</em><code>,</code>...<code>,</code><em>start-n</em><code>.
 *           </code><em>continue-n</em>
 *       <dd>Test measuring labeling or segmentation (<em>start-i</em>, <em>continue-i</em>)
 *           accuracy. Default is no testing.
 *       <dt><code>--training-proportion</code> <em>number-between-0-and-1</em>
 *       <dd>Fraction of data to use for training in a random split. Default is 0.5.
 *       <dt><code>--model-file</code> <em>filename</em>
 *       <dd>The filename for reading (train/run) or saving (train) the model. Default is null.
 *       <dt><code>--random-seed</code> <em>integer</em>
 *       <dd>The random seed for randomly selecting a proportion of the instance list for training
 *           Default is 0.
 *       <dt><code>--orders</code> <em>comma-separated-integers</em>
 *       <dd>List of label Markov orders (main and backoff) Default is 1.
 *       <dt><code>--forbidden</code> <em>regular-expression</em>
 *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> matches the expression, the
 *           corresponding transition is forbidden. Default is <code>\\s</code> (nothing
 *           forbidden).
 *       <dt><code>--allowed</code> <em>regular-expression</em>
 *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> does not match the expression, the
 *           corresponding expression is forbidden. Default is <code>.*</code> (everything
 *           allowed).
 *       <dt><code>--default-label</code> <em>string</em>
 *       <dd>Label for initial context and uninteresting tokens. Default is <code>O</code>.
 *       <dt><code>--viterbi-output</code> <em>boolean</em>
 *       <dd>Print Viterbi periodically during training. Default is <code>false</code>.
 *       <dt><code>--fully-connected</code> <em>boolean</em>
 *       <dd>Include all allowed transitions, even those not in training data. Default is <code>
 *           true</code>.
 *       <dt><code>--n-best</code> <em>positive-integer</em>
 *       <dd>Number of answers to output when applying model. Default is 1.
 *       <dt><code>--include-input</code> <em>boolean</em>
 *       <dd>Whether to include input features when printing decoding output. Default is <code>
 *           false</code>.
 *     </dl>
 *     Remaining arguments:
 *     <ul>
 *       <li><em>training-data-file</em> if training
 *       <li><em>training-and-test-data-file</em>, if training and testing with random split
 *       <li><em>training-data-file</em> <em>test-data-file</em> if training and testing from
 *           separate files
 *       <li><em>test-data-file</em> if testing
 *       <li><em>input-data-file</em> if applying to new data (unlabeled)
 *     </ul>
 *
 * @exception Exception if an error occurs
 */
public static void main(String[] args) throws Exception {
  Reader trainingFile = null, testFile = null;
  InstanceList trainingData = null, testData = null;
  int numEvaluations = 0;
  int iterationsBetweenEvals = 16;
  // Parse options; everything after them is the data file list.
  int restArgs = commandOptions.processOptions(args);
  if (restArgs == args.length) {
    commandOptions.printUsage(true);
    throw new IllegalArgumentException("Missing data file(s)");
  }
  // Open the training and/or test readers depending on mode.
  if (trainOption.value) {
    trainingFile = new FileReader(new File(args[restArgs]));
    if (testOption.value != null && restArgs < args.length - 1)
      testFile = new FileReader(new File(args[restArgs + 1]));
  } else testFile = new FileReader(new File(args[restArgs]));
  Pipe p = null;
  CRF crf = null;
  TransducerEvaluator eval = null;
  // When continuing training or only testing/applying, load the saved model
  // and reuse its input pipe; otherwise build a fresh pipe.
  if (continueTrainingOption.value || !trainOption.value) {
    if (modelOption.value == null) {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Missing model file option");
    }
    ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
    crf = (CRF) s.readObject();
    s.close();
    p = crf.getInputPipe();
  } else {
    p = new SimpleTaggerSentence2FeatureVectorSequence();
    p.getTargetAlphabet().lookupIndex(defaultOption.value);
  }
  // Load instance lists. Blank lines delimit instances in the data files.
  if (trainOption.value) {
    p.setTargetProcessing(true);
    trainingData = new InstanceList(p);
    trainingData.addThruPipe(
        new LineGroupIterator(trainingFile, Pattern.compile("^\\s*$"), true));
    logger.info("Number of features in training data: " + p.getDataAlphabet().size());
    if (testOption.value != null) {
      if (testFile != null) {
        testData = new InstanceList(p);
        testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
      } else {
        // No separate test file: random train/test split of the training data.
        Random r = new Random(randomSeedOption.value);
        InstanceList[] trainingLists =
            trainingData.split(
                r,
                new double[] {trainingFractionOption.value, 1 - trainingFractionOption.value});
        trainingData = trainingLists[0];
        testData = trainingLists[1];
      }
    }
  } else if (testOption.value != null) {
    p.setTargetProcessing(true);
    testData = new InstanceList(p);
    testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
  } else {
    // Apply mode: unlabeled input, no target processing.
    p.setTargetProcessing(false);
    testData = new InstanceList(p);
    testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
  }
  logger.info("Number of predicates: " + p.getDataAlphabet().size());
  // Build the evaluator requested by --test (token accuracy or segmentation).
  if (testOption.value != null) {
    if (testOption.value.startsWith("lab"))
      eval =
          new TokenAccuracyEvaluator(
              new InstanceList[] {trainingData, testData}, new String[] {"Training", "Testing"});
    else if (testOption.value.startsWith("seg=")) {
      // Segmentation spec: comma-separated "start.continue" label pairs.
      String[] pairs = testOption.value.substring(4).split(",");
      if (pairs.length < 1) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException(
            "Missing segment start/continue labels: " + testOption.value);
      }
      String startTags[] = new String[pairs.length];
      String continueTags[] = new String[pairs.length];
      for (int i = 0; i < pairs.length; i++) {
        String[] pair = pairs[i].split("\\.");
        if (pair.length != 2) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException(
              "Incorrectly-specified segment start and end labels: " + pairs[i]);
        }
        startTags[i] = pair[0];
        continueTags[i] = pair[1];
      }
      eval =
          new MultiSegmentationEvaluator(
              new InstanceList[] {trainingData, testData},
              new String[] {"Training", "Testing"},
              startTags,
              continueTags);
    } else {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Invalid test option: " + testOption.value);
    }
  }
  // Log the label alphabet when targets are being processed.
  if (p.isTargetProcessing()) {
    Alphabet targets = p.getTargetAlphabet();
    StringBuffer buf = new StringBuffer("Labels:");
    for (int i = 0; i < targets.size(); i++)
      buf.append(" ").append(targets.lookupObject(i).toString());
    logger.info(buf.toString());
  }
  if (trainOption.value) {
    // Train (possibly continuing from a loaded CRF) and optionally save.
    crf =
        train(
            trainingData,
            testData,
            eval,
            ordersOption.value,
            defaultOption.value,
            forbiddenOption.value,
            allowedOption.value,
            connectedOption.value,
            iterationsOption.value,
            gaussianVarianceOption.value,
            crf);
    if (modelOption.value != null) {
      ObjectOutputStream s = new ObjectOutputStream(new FileOutputStream(modelOption.value));
      s.writeObject(crf);
      s.close();
    }
  } else {
    if (crf == null) {
      if (modelOption.value == null) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Missing model file option");
      }
      ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
      crf = (CRF) s.readObject();
      s.close();
    }
    if (eval != null) test(new NoopTransducerTrainer(crf), eval, testData);
    else {
      // Apply mode: decode each input sequence and print the n-best outputs.
      boolean includeInput = includeInputOption.value();
      for (int i = 0; i < testData.size(); i++) {
        Sequence input = (Sequence) testData.get(i).getData();
        Sequence[] outputs = apply(crf, input, nBestOption.value);
        int k = outputs.length;
        boolean error = false;
        // A decoded answer must align 1:1 with the input tokens.
        for (int a = 0; a < k; a++) {
          if (outputs[a].size() != input.size()) {
            System.err.println("Failed to decode input sequence " + i + ", answer " + a);
            error = true;
          }
        }
        if (!error) {
          // One line per token: the n-best labels, optionally the input features.
          for (int j = 0; j < input.size(); j++) {
            StringBuffer buf = new StringBuffer();
            for (int a = 0; a < k; a++) buf.append(outputs[a].get(j).toString()).append(" ");
            if (includeInput) {
              FeatureVector fv = (FeatureVector) input.get(j);
              buf.append(fv.toString(true));
            }
            System.out.println(buf.toString());
          }
          System.out.println();
        }
      }
    }
  }
}