public Classification classify(Instance instance) {
  FeatureVector fv = (FeatureVector) instance.getData();
  assert (instancePipe == null || fv.getAlphabet() == this.instancePipe.getDataAlphabet());

  Node leaf = getLeaf(m_root, fv);
  return new Classification(instance, this, leaf.getGainRatio().getBaseLabelDistribution());
}
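/*
 * Usage sketch (illustrative, not part of the original class): classify held-out
 * instances and print the winning label from each leaf's label distribution.
 * The names `tree` and `testList` are hypothetical, and the C45 type assumes this
 * method belongs to MALLET's C45 decision-tree classifier, which it resembles.
 */
static void printPredictions(C45 tree, InstanceList testList) {
  for (Instance inst : testList) {
    Classification c = tree.classify(inst);
    System.out.println(inst.getName() + " -> " + c.getLabeling().getBestLabel());
  }
}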
public static InstanceList copy(InstanceList instances) {
  InstanceList ret = (InstanceList) instances.clone();
  // LabelAlphabet labelDict = (LabelAlphabet) ret.getTargetAlphabet();
  Alphabet featDict = ret.getDataAlphabet();

  for (int i = 0; i < ret.size(); i++) {
    Instance instance = ret.get(i);
    Instance clone = (Instance) instance.clone();
    FeatureVector fv = (FeatureVector) clone.getData();

    int[] indices = fv.getIndices();
    double[] values = fv.getValues();

    int[] newIndices = new int[indices.length];
    System.arraycopy(indices, 0, newIndices, 0, indices.length);
    double[] newValues = new double[indices.length];
    System.arraycopy(values, 0, newValues, 0, indices.length);

    FeatureVector newFv = new FeatureVector(featDict, newIndices, newValues);
    Instance newInstance =
        new Instance(newFv, instance.getTarget(), instance.getName(), instance.getSource());
    ret.set(i, newInstance);
  }
  return ret;
}
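/*
 * Usage sketch (illustrative; `original` is a hypothetical name): the copy owns
 * fresh index/value arrays, so mutating the copy leaves the original untouched.
 */
static void copyIsIndependent(InstanceList original) {
  InstanceList copied = copy(original);
  FeatureVector fv = (FeatureVector) copied.get(0).getData();
  if (fv.numLocations() > 0) {
    fv.setValueAtLocation(0, 123.0); // mutate the copy only
  }
  FeatureVector origFv = (FeatureVector) original.get(0).getData();
  // origFv.valueAtLocation(0) still holds its original value here.
}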
public void split() {
  if (m_ilist == null) {
    throw new IllegalStateException("Frozen. Cannot split.");
  }

  // Partition on the best feature/threshold found by the gain-ratio search:
  // instances with value <= threshold go to the left child. Features absent
  // from a sparse FeatureVector read as 0.0 and therefore also go left.
  int numLeftChildren = 0;
  boolean[] toLeftChild = new boolean[m_instIndices.length];
  for (int i = 0; i < m_instIndices.length; i++) {
    Instance instance = m_ilist.get(m_instIndices[i]);
    FeatureVector fv = (FeatureVector) instance.getData();
    if (fv.value(m_gainRatio.getMaxValuedIndex()) <= m_gainRatio.getMaxValuedThreshold()) {
      toLeftChild[i] = true;
      numLeftChildren++;
    } else {
      toLeftChild[i] = false;
    }
  }

  logger.info(
      "leftChild.size=" + numLeftChildren
          + " rightChild.size=" + (m_instIndices.length - numLeftChildren));

  int[] leftIndices = new int[numLeftChildren];
  int[] rightIndices = new int[m_instIndices.length - numLeftChildren];
  int li = 0, ri = 0;
  for (int i = 0; i < m_instIndices.length; i++) {
    if (toLeftChild[i]) {
      leftIndices[li++] = m_instIndices[i];
    } else {
      rightIndices[ri++] = m_instIndices[i];
    }
  }

  m_leftChild = new Node(m_ilist, this, m_minNumInsts, leftIndices);
  m_rightChild = new Node(m_ilist, this, m_minNumInsts, rightIndices);
}
private double dataLogProbability(Instance instance, int labelIndex) {
  FeatureVector fv = (FeatureVector) instance.getData();
  int fvisize = fv.numLocations();
  double logProb = 0;
  for (int fvi = 0; fvi < fvisize; fvi++) {
    logProb += fv.valueAtLocation(fvi) * p[labelIndex].logProbability(fv.indexAtLocation(fvi));
  }
  return logProb;
}
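/*
 * Worked sketch (hypothetical numbers): the loop above computes
 * sum over features f of count(f) * log p(f | label). For a document with
 * feature counts {w1: 2, w2: 1} and p(w1|c) = 0.1, p(w2|c) = 0.05:
 */
static double dataLogProbabilityExample() {
  return 2 * Math.log(0.1) + 1 * Math.log(0.05); // ≈ -7.6009
}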
public void count() {
  TIntIntHashMap docCounts = new TIntIntHashMap();
  int index = 0;

  if (instances.size() == 0) {
    logger.info("Instance list is empty");
    return;
  }

  if (instances.get(0).getData() instanceof FeatureSequence) {
    for (Instance instance : instances) {
      FeatureSequence features = (FeatureSequence) instance.getData();

      for (int i = 0; i < features.getLength(); i++) {
        docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
      }

      int[] keys = docCounts.keys();
      // Iterate over every key; the original "keys.length - 1" bound silently
      // dropped one feature per document.
      for (int i = 0; i < keys.length; i++) {
        int feature = keys[i];
        featureCounts[feature] += docCounts.get(feature);
        documentFrequencies[feature]++;
      }

      docCounts = new TIntIntHashMap();

      index++;
      if (index % 1000 == 0) {
        System.err.println(index);
      }
    }
  } else if (instances.get(0).getData() instanceof FeatureVector) {
    for (Instance instance : instances) {
      FeatureVector features = (FeatureVector) instance.getData();

      for (int location = 0; location < features.numLocations(); location++) {
        int feature = features.indexAtLocation(location);
        double value = features.valueAtLocation(location);
        documentFrequencies[feature]++;
        featureCounts[feature] += value;
      }

      index++;
      if (index % 1000 == 0) {
        System.err.println(index);
      }
    }
  } else {
    logger.info("Unsupported data class: " + instances.get(0).getData().getClass().getName());
  }
}
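/*
 * Usage sketch (an assumption, not part of the original tool): once count() has
 * filled featureCounts and documentFrequencies, an idf weight can be derived per
 * feature. `numDocs` is hypothetical, and the +1 smoothing is a common choice,
 * not something the original code prescribes.
 */
static double idf(int numDocs, int documentFrequency) {
  return Math.log((double) numDocs / (1 + documentFrequency));
}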
public static InstanceList scale(InstanceList trainingList, double lower, double upper) {
  InstanceList ret = copy(trainingList);
  Alphabet featDict = ret.getDataAlphabet();

  double[] feat_max = new double[featDict.size()];
  double[] feat_min = new double[featDict.size()];
  for (int i = 0; i < feat_max.length; i++) {
    feat_max[i] = -Double.MAX_VALUE;
    feat_min[i] = Double.MAX_VALUE;
  }

  // First pass: collect the per-feature min and max.
  for (int i = 0; i < ret.size(); i++) {
    Instance inst = ret.get(i);
    FeatureVector fv = (FeatureVector) inst.getData();
    for (int loc = 0; loc < fv.numLocations(); loc++) {
      int featId = fv.indexAtLocation(loc);
      double value = fv.valueAtLocation(loc);
      feat_max[featId] = Math.max(value, feat_max[featId]);
      feat_min[featId] = Math.min(value, feat_min[featId]);
    }
  }

  // Second pass: rescale each stored value into [lower, upper].
  for (int i = 0; i < ret.size(); i++) {
    Instance inst = ret.get(i);
    FeatureVector fv = (FeatureVector) inst.getData();
    for (int loc = 0; loc < fv.numLocations(); loc++) {
      int featId = fv.indexAtLocation(loc);
      double value = fv.valueAtLocation(loc);
      double maxValue = feat_max[featId];
      double minValue = feat_min[featId];
      double newValue;
      if (maxValue == minValue) {
        newValue = value; // constant feature: leave unchanged
      } else if (value == minValue) {
        newValue = lower;
      } else if (value == maxValue) {
        newValue = upper;
      } else {
        newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
      }
      fv.setValueAtLocation(loc, newValue);
    }
  }
  return ret;
}
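/*
 * Usage sketch (illustrative; `trainingList` is a hypothetical name): scale all
 * features into [-1, 1], as is common before SVM training. The caller's list is
 * untouched because scale() works on the deep copy produced by copy() above.
 */
static InstanceList scaleForSvm(InstanceList trainingList) {
  return scale(trainingList, -1.0, 1.0);
}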
protected List<Double> serializeFv(FeatureVector fv) {
  List<Double> features = new ArrayList<>();
  int numLocations = fv.numLocations();
  int[] indices = fv.getIndices();
  for (int index = 0; index < numLocations; index++) {
    int featureIndex = indices[index];
    double value = fv.value(featureIndex);
    features.add(value);
  }
  return features;
}
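/*
 * Equivalent sketch using positional access (an assumption: the same ordering is
 * acceptable downstream). valueAtLocation(loc) reads the stored value directly
 * and so avoids the per-feature index lookup that value(featureIndex) performs.
 */
protected List<Double> serializeFvByLocation(FeatureVector fv) {
  List<Double> features = new ArrayList<>(fv.numLocations());
  for (int loc = 0; loc < fv.numLocations(); loc++) {
    features.add(fv.valueAtLocation(loc));
  }
  return features;
}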
public Instance pipe(Instance carrier) {
  Sequence data = (Sequence) carrier.getData();
  Sequence target = (Sequence) carrier.getTarget();
  if (data.size() != target.size()) {
    throw new IllegalArgumentException(
        "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
            + "data.length = " + data.size() + ", target.length = " + target.size());
  }
  int N = data.size();
  if (data instanceof TokenSequence) {
    throw new UnsupportedOperationException("Not yet implemented.");
  } else if (data instanceof FeatureVectorSequence) {
    FeatureVectorSequence fvs = (FeatureVectorSequence) data;
    Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null;
    for (int i = 0; i < N; i++) {
      Object label = target.get(i);
      writer.print(label);
      FeatureVector fv = fvs.getFeatureVector(i);
      for (int loc = 0; loc < fv.numLocations(); loc++) {
        writer.print(' ');
        String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString();
        double value = fv.valueAtLocation(loc);
        // if (!Maths.almostEquals(value, 1.0)) {
        //   throw new IllegalArgumentException("Printing to SimpleTagger format: FeatureVector not binary at time slice " + i + " fv:" + fv);
        // }
        // Note: the value is appended directly to the feature name, with no separator.
        writer.print(fname + String.valueOf(value));
      }
      writer.println();
    }
  } else {
    throw new IllegalArgumentException("Don't know how to print data of type " + data);
  }
  writer.println();
  // writer.print(getDataAlphabet());
  return carrier;
}
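/*
 * Output sketch (hypothetical label and feature names): this mirrors the
 * writer.print(fname + String.valueOf(value)) call above for one time slice
 * with two features, W=Boston and CAP, both valued 1.0.
 */
static String exampleTokenLine() {
  return "B-LOC" + " " + "W=Boston" + String.valueOf(1.0) + " " + "CAP" + String.valueOf(1.0);
  // -> "B-LOC W=Boston1.0 CAP1.0"
}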
public void addToken(FeatureVector fv) {
  int[] indices = fv.getIndices();
  Alphabet dictionary = fv.getAlphabet();
  int indicesLength = fv.numLocations();
  for (int i = 0; i < indicesLength; i++) {
    String key = dictionary.lookupObject(indices[i]).toString();
    // log.info(key);
    if (!contextOnly
        || key.endsWith("/+1")
        || key.endsWith("/-1")
        || key.endsWith("/-2")
        || key.endsWith("/+2")) {
      map.increment(key);
    }
  }
}
/**
 * Classify an instance using NaiveBayes according to the trained data. The alphabet of the
 * featureVector of the instance must match the alphabet of the pipe used to train the classifier.
 *
 * @param instance to be classified. Data field must be a FeatureVector
 * @return Classification containing the labeling of the instance
 */
public Classification classify(Instance instance) {
  // Note that the current size of the label alphabet can be larger
  // than it was at the time of training. We are careful to handle
  // those labels correctly here. For example, we assume the log prior
  // probability of those classes is minus infinity.
  int numClasses = getLabelAlphabet().size();
  double[] scores = new double[numClasses];
  FeatureVector fv = (FeatureVector) instance.getData();
  // Make sure the feature vector's feature dictionary matches
  // what we are expecting from our data pipe (and thus our notion
  // of feature probabilities).
  assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet());
  int fvisize = fv.numLocations();

  prior.addLogProbabilities(scores);

  // Set the scores according to the feature weights and per-class probabilities
  for (int fvi = 0; fvi < fvisize; fvi++) {
    int fi = fv.indexAtLocation(fvi);
    for (int ci = 0; ci < numClasses; ci++) {
      // Guard against the data or target alphabet growing, which can happen when
      // classifying a never-before-seen feature. Ignore these.
      if (ci >= p.length || fi >= p[ci].size()) {
        continue;
      }
      scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi);
    }
  }

  // Get the scores in the range near zero, where exp() is more accurate
  double maxScore = Double.NEGATIVE_INFINITY;
  for (int ci = 0; ci < numClasses; ci++) {
    if (scores[ci] > maxScore) {
      maxScore = scores[ci];
    }
  }
  for (int ci = 0; ci < numClasses; ci++) {
    scores[ci] -= maxScore;
  }

  // Exponentiate and normalize
  double sum = 0;
  for (int ci = 0; ci < numClasses; ci++) {
    sum += (scores[ci] = Math.exp(scores[ci]));
  }
  for (int ci = 0; ci < numClasses; ci++) {
    scores[ci] /= sum;
  }

  // Create and return a Classification object
  return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
}
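/*
 * Numeric sketch (hypothetical scores) of why the max is subtracted before
 * exponentiating: Math.exp(-1000) underflows to 0.0, so normalizing the raw
 * scores would divide 0 by 0, whereas shifting by the max keeps the posterior
 * finite and correct.
 */
static double posteriorOfBestClass() {
  double a = -1000.0, b = -1001.0;
  double max = Math.max(a, b);
  double ea = Math.exp(a - max); // 1.0
  double eb = Math.exp(b - max); // ≈ 0.3679
  return ea / (ea + eb); // ≈ 0.7311
}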
private Node getLeaf(Node node, FeatureVector fv) {
  if (node.getLeftChild() == null && node.getRightChild() == null) {
    return node;
  } else if (fv.value(node.getGainRatio().getMaxValuedIndex())
      <= node.getGainRatio().getMaxValuedThreshold()) {
    return getLeaf(node.getLeftChild(), fv);
  } else {
    return getLeaf(node.getRightChild(), fv);
  }
}
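/*
 * Equivalent iterative descent (a sketch, assuming the tree is full; split()
 * above always creates both children together): avoids deep recursion on very
 * deep trees. Not part of the original class.
 */
private Node getLeafIterative(Node node, FeatureVector fv) {
  while (node.getLeftChild() != null || node.getRightChild() != null) {
    node =
        (fv.value(node.getGainRatio().getMaxValuedIndex())
                <= node.getGainRatio().getMaxValuedThreshold())
            ? node.getLeftChild()
            : node.getRightChild();
  }
  return node;
}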
/**
 * Command-line wrapper to train, test, or run a generic CRF-based tagger.
 *
 * @param args the command line arguments. Options (shell and Java quoting should be added as
 *     needed):
 *     <dl>
 *       <dt><code>--help</code> <em>boolean</em>
 *       <dd>Print this command line option usage information. Give <code>true</code> for longer
 *           documentation. Default is <code>false</code>.
 *       <dt><code>--prefix-code</code> <em>Java-code</em>
 *       <dd>Java code you want run before any other interpreted code. Note that the text is
 *           interpreted without modification, so unlike some other Java code options, you need to
 *           include any necessary 'new's. Default is null.
 *       <dt><code>--gaussian-variance</code> <em>positive-number</em>
 *       <dd>The Gaussian prior variance used for training. Default is 10.0.
 *       <dt><code>--train</code> <em>boolean</em>
 *       <dd>Whether to train. Default is <code>false</code>.
 *       <dt><code>--iterations</code> <em>positive-integer</em>
 *       <dd>Number of training iterations. Default is 500.
 *       <dt><code>--test</code> <code>lab</code> or <code>seg=</code><em>start-1</em><code>.</code><em>continue-1</em><code>,</code>...<code>,</code><em>start-n</em><code>.</code><em>continue-n</em>
 *       <dd>Test measuring labeling or segmentation (<em>start-i</em>, <em>continue-i</em>)
 *           accuracy. Default is no testing.
 *       <dt><code>--training-proportion</code> <em>number-between-0-and-1</em>
 *       <dd>Fraction of data to use for training in a random split. Default is 0.5.
 *       <dt><code>--model-file</code> <em>filename</em>
 *       <dd>The filename for reading (train/run) or saving (train) the model. Default is null.
 *       <dt><code>--random-seed</code> <em>integer</em>
 *       <dd>The random seed for randomly selecting a proportion of the instance list for
 *           training. Default is 0.
 *       <dt><code>--orders</code> <em>comma-separated-integers</em>
 *       <dd>List of label Markov orders (main and backoff). Default is 1.
 *       <dt><code>--forbidden</code> <em>regular-expression</em>
 *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> matches the expression, the
 *           corresponding transition is forbidden. Default is <code>\\s</code> (nothing
 *           forbidden).
 *       <dt><code>--allowed</code> <em>regular-expression</em>
 *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> does not match the expression, the
 *           corresponding transition is forbidden. Default is <code>.*</code> (everything
 *           allowed).
 *       <dt><code>--default-label</code> <em>string</em>
 *       <dd>Label for initial context and uninteresting tokens. Default is <code>O</code>.
 *       <dt><code>--viterbi-output</code> <em>boolean</em>
 *       <dd>Print Viterbi periodically during training. Default is <code>false</code>.
 *       <dt><code>--fully-connected</code> <em>boolean</em>
 *       <dd>Include all allowed transitions, even those not in training data. Default is
 *           <code>true</code>.
 *       <dt><code>--n-best</code> <em>positive-integer</em>
 *       <dd>Number of answers to output when applying the model. Default is 1.
 *       <dt><code>--include-input</code> <em>boolean</em>
 *       <dd>Whether to include input features when printing decoding output. Default is
 *           <code>false</code>.
 *     </dl>
 *     Remaining arguments:
 *     <ul>
 *       <li><em>training-data-file</em> if training
 *       <li><em>training-and-test-data-file</em>, if training and testing with random split
 *       <li><em>training-data-file</em> <em>test-data-file</em> if training and testing from
 *           separate files
 *       <li><em>test-data-file</em> if testing
 *       <li><em>input-data-file</em> if applying to new data (unlabeled)
 *     </ul>
 *
 * @exception Exception if an error occurs
 */
public static void main(String[] args) throws Exception {
  Reader trainingFile = null, testFile = null;
  InstanceList trainingData = null, testData = null;
  int numEvaluations = 0;
  int iterationsBetweenEvals = 16;
  int restArgs = commandOptions.processOptions(args);
  if (restArgs == args.length) {
    commandOptions.printUsage(true);
    throw new IllegalArgumentException("Missing data file(s)");
  }
  if (trainOption.value) {
    trainingFile = new FileReader(new File(args[restArgs]));
    if (testOption.value != null && restArgs < args.length - 1) {
      testFile = new FileReader(new File(args[restArgs + 1]));
    }
  } else {
    testFile = new FileReader(new File(args[restArgs]));
  }

  Pipe p = null;
  CRF crf = null;
  TransducerEvaluator eval = null;
  if (continueTrainingOption.value || !trainOption.value) {
    if (modelOption.value == null) {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Missing model file option");
    }
    ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
    crf = (CRF) s.readObject();
    s.close();
    p = crf.getInputPipe();
  } else {
    p = new SimpleTaggerSentence2FeatureVectorSequence();
    p.getTargetAlphabet().lookupIndex(defaultOption.value);
  }

  if (trainOption.value) {
    p.setTargetProcessing(true);
    trainingData = new InstanceList(p);
    trainingData.addThruPipe(new LineGroupIterator(trainingFile, Pattern.compile("^\\s*$"), true));
    logger.info("Number of features in training data: " + p.getDataAlphabet().size());
    if (testOption.value != null) {
      if (testFile != null) {
        testData = new InstanceList(p);
        testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
      } else {
        Random r = new Random(randomSeedOption.value);
        InstanceList[] trainingLists =
            trainingData.split(
                r, new double[] {trainingFractionOption.value, 1 - trainingFractionOption.value});
        trainingData = trainingLists[0];
        testData = trainingLists[1];
      }
    }
  } else if (testOption.value != null) {
    p.setTargetProcessing(true);
    testData = new InstanceList(p);
    testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
  } else {
    p.setTargetProcessing(false);
    testData = new InstanceList(p);
    testData.addThruPipe(new LineGroupIterator(testFile, Pattern.compile("^\\s*$"), true));
  }
  logger.info("Number of predicates: " + p.getDataAlphabet().size());

  if (testOption.value != null) {
    if (testOption.value.startsWith("lab")) {
      eval =
          new TokenAccuracyEvaluator(
              new InstanceList[] {trainingData, testData}, new String[] {"Training", "Testing"});
    } else if (testOption.value.startsWith("seg=")) {
      String[] pairs = testOption.value.substring(4).split(",");
      if (pairs.length < 1) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException(
            "Missing segment start/continue labels: " + testOption.value);
      }
      String[] startTags = new String[pairs.length];
      String[] continueTags = new String[pairs.length];
      for (int i = 0; i < pairs.length; i++) {
        String[] pair = pairs[i].split("\\.");
        if (pair.length != 2) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException(
              "Incorrectly-specified segment start and end labels: " + pairs[i]);
        }
        startTags[i] = pair[0];
        continueTags[i] = pair[1];
      }
      eval =
          new MultiSegmentationEvaluator(
              new InstanceList[] {trainingData, testData},
              new String[] {"Training", "Testing"},
              startTags,
              continueTags);
    } else {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Invalid test option: " + testOption.value);
    }
  }

  if (p.isTargetProcessing()) {
    Alphabet targets = p.getTargetAlphabet();
    StringBuffer buf = new StringBuffer("Labels:");
    for (int i = 0; i < targets.size(); i++) {
      buf.append(" ").append(targets.lookupObject(i).toString());
    }
    logger.info(buf.toString());
  }

  if (trainOption.value) {
    crf =
        train(
            trainingData,
            testData,
            eval,
            ordersOption.value,
            defaultOption.value,
            forbiddenOption.value,
            allowedOption.value,
            connectedOption.value,
            iterationsOption.value,
            gaussianVarianceOption.value,
            crf);
    if (modelOption.value != null) {
      ObjectOutputStream s = new ObjectOutputStream(new FileOutputStream(modelOption.value));
      s.writeObject(crf);
      s.close();
    }
  } else {
    if (crf == null) {
      if (modelOption.value == null) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Missing model file option");
      }
      ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
      crf = (CRF) s.readObject();
      s.close();
    }
    if (eval != null) {
      test(new NoopTransducerTrainer(crf), eval, testData);
    } else {
      boolean includeInput = includeInputOption.value();
      for (int i = 0; i < testData.size(); i++) {
        Sequence input = (Sequence) testData.get(i).getData();
        Sequence[] outputs = apply(crf, input, nBestOption.value);
        int k = outputs.length;
        boolean error = false;
        for (int a = 0; a < k; a++) {
          if (outputs[a].size() != input.size()) {
            System.err.println("Failed to decode input sequence " + i + ", answer " + a);
            error = true;
          }
        }
        if (!error) {
          for (int j = 0; j < input.size(); j++) {
            StringBuffer buf = new StringBuffer();
            for (int a = 0; a < k; a++) {
              buf.append(outputs[a].get(j).toString()).append(" ");
            }
            if (includeInput) {
              FeatureVector fv = (FeatureVector) input.get(j);
              buf.append(fv.toString(true));
            }
            System.out.println(buf.toString());
          }
          System.out.println();
        }
      }
    }
  }
}
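/*
 * Usage sketch (hypothetical file names; assumes this main lives in MALLET's
 * SimpleTagger class, per the SimpleTagger pipe used above): train on train.txt,
 * hold out 20% for a labeling-accuracy test via a random split, and save the
 * model.
 */
static void exampleInvocation() throws Exception {
  SimpleTagger.main(
      new String[] {
        "--train", "true",
        "--test", "lab",
        "--training-proportion", "0.8",
        "--model-file", "tagger.model",
        "train.txt"
      });
}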
/**
 * Process the xml file and output a csv file with the results in the same directory.
 *
 * @param dataFile the xml file to process
 * @param suffix suffix for identifying the data file
 * @throws ResourceInitializationException
 * @throws UIMAException
 * @throws IOException
 * @throws AnalysisEngineProcessException
 * @throws SimilarityException
 */
private void processEnglishFile(String dataFile, String suffix)
    throws ResourceInitializationException, UIMAException, IOException,
        AnalysisEngineProcessException, SimilarityException {

  /** Parameters for matching tree structures */
  String parameterList =
      Joiner.on(",")
          .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Marker which adds relational information to a pair of trees */
  MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

  /** Load stopwords for english */
  marker.useStopwords(Stopwords.STOPWORD_EN);

  /** Tree serializer for converting tree structures to string */
  TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

  /** Instantiate CASes */
  JCas questionCas = JCasFactory.createJCas();
  JCas commentCas = JCasFactory.createJCas();

  WriteFile out = new WriteFile(dataFile + ".csv");

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");
  doc.select("QURAN").remove();
  doc.select("HADEETH").remove();

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  Map<String, Boolean> commentIsDialogue = new HashMap<>();

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String quserid = question.attr("QUSERID");
    String qtype = question.attr("QTYPE");
    String qgold_yn = question.attr("QGOLD_YN");
    String qsubject = question.getElementsByTag("QSubject").get(0).text();
    String qbody = question.getElementsByTag("QBody").get(0).text();

    /** Setup question CAS */
    questionCas.reset();
    questionCas.setDocumentLanguage("en");
    questionCas.setDocumentText(qsubject + ". " + qbody);

    /** Run the UIMA pipeline */
    SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
    // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

    /** Parse comment nodes */
    Elements comments = question.getElementsByTag("Comment");
    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cuserid = comment.attr("CUSERID");
      String cgold = comment.attr("CGOLD");
      String cgold_yn = comment.attr("CGOLD_YN");
      String csubject = comment.getElementsByTag("CSubject").get(0).text();
      String cbody = comment.getElementsByTag("CBody").get(0).text();

      /** Setup comment CAS */
      commentCas.reset();
      commentCas.setDocumentLanguage("en");
      commentCas.setDocumentText(csubject + ". " + cbody);

      /** Run the UIMA pipeline */
      SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
      // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody));
" + // cbody)); FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList); /** * ************************************* * * * PLUG YOUR FEATURES HERE * * * * * ************************************* */ /** * fv is actually an AugmentableFeatureVector from the Mallet library * * <p>Internally the features are named so you must specify an unique identifier. * * <p>An example: * * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42); * * <p>or: * * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv; * afv.add("your_super_feature_id", 42); */ boolean quseridEqCuserid = quserid.equals(cuserid); if (quseridEqCuserid) { commentIsDialogue.put(cid, true); } // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid); /** * ************************************* * * * THANKS! * * * * * ************************************* */ /** Produce output line */ if (firstRow) { out.write("qid,cgold,cgold_yn"); for (int i = 0; i < fv.numLocations(); i++) { int featureIndex = i + 1; out.write(",f" + featureIndex); } out.write("\n"); firstRow = false; } List<Double> features = this.serializeFv(fv); out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features)); /** Produce also the file needed to train structural models */ if (PRODUCE_SVMLIGHTTK_DATA) { produceSVMLightTKExample( questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features); } } } for (String commentId : commentIsDialogue.keySet()) { this.fm.writeLn(dataFile + ".dialogue.txt", commentId); } this.fm.closeFiles(); out.close(); }
public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
    throws SimilarityException, UIMAException, IOException {

  /** We do not have a lemmatizer so we work with tokens */
  String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

  /** Instantiate CASes */
  JCas questionCas = JCasFactory.createJCas();
  JCas commentCas = JCasFactory.createJCas();

  WriteFile out = new WriteFile(dataFile + ".csv");

  Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

  boolean firstRow = true;

  /** Consume data */
  Elements questions = doc.getElementsByTag("Question");
  int numberOfQuestions = questions.size();
  int questionNumber = 1;

  for (Element question : questions) {
    System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

    /** Parse question node */
    String qid = question.attr("QID");
    String qcategory = question.attr("QCATEGORY");
    String qdate = question.attr("QDATE");
    String qsubject =
        question
            .getElementsByTag("QSubject")
            .get(0)
            .text()
            .replaceAll("/", "")
            .replaceAll("~", "");
    String qbody =
        question.getElementsByTag("QBody").get(0).text().replaceAll("/", "").replaceAll("~", "");

    /** Get analyzed text for question */
    if (USE_QCRI_ALT_TOOLS) {
      questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
    } else {
      questionCas.reset();
      questionCas.setDocumentLanguage("ar");
      questionCas.setDocumentText(qsubject + ". " + qbody);
      SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
    }

    /** Parse answer nodes */
    Elements comments = question.getElementsByTag("Answer");
    for (Element comment : comments) {
      String cid = comment.attr("CID");
      String cgold = comment.attr("CGOLD");
      String cbody = comment.text().replaceAll("/", "").replaceAll("~", "");

      /** Get analyzed text for comment */
      if (USE_QCRI_ALT_TOOLS) {
        commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
      } else {
        commentCas.reset();
        commentCas.setDocumentLanguage("ar");
        commentCas.setDocumentText(cbody);
        SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
      }

      /** Compute features between question and comment */
      FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

      /**
       * *************************************
       * * PLUG YOUR FEATURES HERE *
       * *************************************
       */

      /**
       * fv is actually an AugmentableFeatureVector from the Mallet library
       *
       * <p>Internally the features are named so you must specify a unique identifier.
       *
       * <p>An example:
       *
       * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
       *
       * <p>or:
       *
       * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
       * afv.add("your_super_feature_id", 42);
       */

      /**
       * *************************************
       * * THANKS! *
       * *************************************
       */

      /** Produce output line, writing the header first if needed */
      if (firstRow) {
        out.write("cid,cgold");
        for (int i = 0; i < fv.numLocations(); i++) {
          int featureIndex = i + 1;
          out.write(",f" + featureIndex);
        }
        out.write("\n");
        firstRow = false;
      }

      List<Double> features = this.serializeFv(fv);
      out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
    }
  }

  this.fm.closeFiles();
  out.close();
}