public static InstanceList scale(InstanceList trainingList, double lower, double upper) { InstanceList ret = copy(trainingList); Alphabet featDict = ret.getDataAlphabet(); double[] feat_max = new double[featDict.size()]; double[] feat_min = new double[featDict.size()]; for (int i = 0; i < feat_max.length; i++) { feat_max[i] = -Double.MAX_VALUE; feat_min[i] = Double.MAX_VALUE; } for (int i = 0; i < ret.size(); i++) { Instance inst = ret.get(i); FeatureVector fv = (FeatureVector) inst.getData(); for (int loc = 0; loc < fv.numLocations(); loc++) { int featId = fv.indexAtLocation(loc); double value = fv.valueAtLocation(loc); double maxValue = feat_max[featId]; double minValue = feat_min[featId]; double newMaxValue = Math.max(value, maxValue); double newMinValue = Math.min(value, minValue); feat_max[featId] = newMaxValue; feat_min[featId] = newMinValue; } } // double lower = -1; // double upper = 1; for (int i = 0; i < ret.size(); i++) { Instance inst = ret.get(i); FeatureVector fv = (FeatureVector) inst.getData(); for (int loc = 0; loc < fv.numLocations(); loc++) { int featId = fv.indexAtLocation(loc); double value = fv.valueAtLocation(loc); double maxValue = feat_max[featId]; double minValue = feat_min[featId]; double newValue = Double.NaN; if (maxValue == minValue) { newValue = value; } else if (value == minValue) { newValue = lower; } else if (value == maxValue) { newValue = upper; } else { newValue = lower + (upper - lower) * (value - minValue) / (maxValue - minValue); } fv.setValueAtLocation(loc, newValue); } } return ret; }
/**
 * Sums, over every stored location of the instance's feature vector, the feature value multiplied
 * by the log probability that {@code p[labelIndex]} assigns to that feature.
 *
 * @param instance instance whose data is a {@link FeatureVector}
 * @param labelIndex index into {@code p} selecting the per-label distribution
 * @return the accumulated weighted log probability (0 for a vector with no locations)
 */
private double dataLogProbability(Instance instance, int labelIndex) {
  FeatureVector features = (FeatureVector) instance.getData();
  final int locationCount = features.numLocations();

  double total = 0;
  for (int loc = 0; loc < locationCount; loc++) {
    double weight = features.valueAtLocation(loc);
    int featureIndex = features.indexAtLocation(loc);
    total += weight * p[labelIndex].logProbability(featureIndex);
  }
  return total;
}
/**
 * Accumulates per-feature totals into {@code featureCounts} and per-feature document counts into
 * {@code documentFrequencies} for every instance in {@code instances}.
 *
 * <p>The data type of the first instance decides how the whole list is read:
 *
 * <ul>
 *   <li>{@link FeatureSequence}: each occurrence of a feature adds 1 to its total, and each
 *       document containing the feature adds 1 to its document frequency;
 *   <li>{@link FeatureVector}: each stored location adds its value to the feature's total and 1
 *       to its document frequency;
 *   <li>anything else: a message is logged and nothing is counted.
 * </ul>
 *
 * <p>An empty instance list is logged and ignored. A progress counter is printed to standard
 * error after every 1000 instances.
 */
public void count() {
  if (instances.size() == 0) {
    logger.info("Instance list is empty");
    return;
  }

  Object firstData = instances.get(0).getData();
  int processed = 0;

  if (firstData instanceof FeatureSequence) {
    for (Instance instance : instances) {
      FeatureSequence features = (FeatureSequence) instance.getData();

      // Occurrences of each feature within this single document.
      TIntIntHashMap docCounts = new TIntIntHashMap();
      for (int i = 0; i < features.getLength(); i++) {
        docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
      }

      // Visit every distinct feature of the document. The previous loop bound of
      // keys.length - 1 silently dropped one feature per document.
      for (int feature : docCounts.keys()) {
        featureCounts[feature] += docCounts.get(feature);
        documentFrequencies[feature]++;
      }

      processed++;
      if (processed % 1000 == 0) {
        System.err.println(processed);
      }
    }
  } else if (firstData instanceof FeatureVector) {
    for (Instance instance : instances) {
      FeatureVector features = (FeatureVector) instance.getData();
      for (int location = 0; location < features.numLocations(); location++) {
        int feature = features.indexAtLocation(location);
        double value = features.valueAtLocation(location);
        documentFrequencies[feature]++;
        featureCounts[feature] += value;
      }

      processed++;
      if (processed % 1000 == 0) {
        System.err.println(processed);
      }
    }
  } else {
    logger.info("Unsupported data class: " + firstData.getClass().getName());
  }
}
/**
 * Flattens a feature vector into a list of its values.
 *
 * <p>For each of the vector's {@code numLocations()} entries, the feature index is read from
 * {@code getIndices()} and the value is looked up by that index with {@code value(int)}; the
 * values are appended in location order.
 *
 * @param fv the feature vector to serialize
 * @return a new mutable list holding one value per location
 */
protected List<Double> serializeFv(FeatureVector fv) {
  List<Double> values = new ArrayList<>();
  final int locationCount = fv.numLocations();
  final int[] featureIndices = fv.getIndices();

  int location = 0;
  while (location < locationCount) {
    values.add(fv.value(featureIndices[location]));
    location++;
  }
  return values;
}
public Instance pipe(Instance carrier) { Sequence data = (Sequence) carrier.getData(); Sequence target = (Sequence) carrier.getTarget(); if (data.size() != target.size()) throw new IllegalArgumentException( "Trying to print into SimpleTagger format, where data and target lengths do not match\n" + "data.length = " + data.size() + ", target.length = " + target.size()); int N = data.size(); if (data instanceof TokenSequence) { throw new UnsupportedOperationException("Not yet implemented."); } else if (data instanceof FeatureVectorSequence) { FeatureVectorSequence fvs = (FeatureVectorSequence) data; Alphabet dict = (fvs.size() > 0) ? fvs.getFeatureVector(0).getAlphabet() : null; for (int i = 0; i < N; i++) { Object label = target.get(i); writer.print(label); FeatureVector fv = fvs.getFeatureVector(i); for (int loc = 0; loc < fv.numLocations(); loc++) { writer.print(' '); String fname = dict.lookupObject(fv.indexAtLocation(loc)).toString(); double value = fv.valueAtLocation(loc); // if (!Maths.almostEquals(value, 1.0)) { // throw new IllegalArgumentException ("Printing to SimpleTagger format: FeatureVector // not binary at time slice "+i+" fv:"+fv); // } writer.print(fname + String.valueOf(value)); } writer.println(); } } else { throw new IllegalArgumentException("Don't know how to print data of type " + data); } writer.println(); // writer.print(getDataAlphabet()); return carrier; }
public void addToken(FeatureVector fv) { int[] indices = fv.getIndices(); Alphabet dictionary = fv.getAlphabet(); int indicesLength = fv.numLocations(); for (int i = 0; i < indicesLength; i++) { String key = dictionary.lookupObject(indices[i]).toString(); // log.info( key ); if (!contextOnly || (contextOnly && (key.endsWith("/+1") || key.endsWith("/-1") || key.endsWith("/-2") || key.endsWith("/+2")))) { map.increment(key); } } }
/** * Classify an instance using NaiveBayes according to the trained data. The alphabet of the * featureVector of the instance must match the alphabe of the pipe used to train the classifier. * * @param instance to be classified. Data field must be a FeatureVector * @return Classification containing the labeling of the instance */ public Classification classify(Instance instance) { // Note that the current size of the label alphabet can be larger // than it was at the time of training. We are careful here // to correctly handle those labels here. For example, // we assume the log prior probability of those classes is // minus infinity. int numClasses = getLabelAlphabet().size(); double[] scores = new double[numClasses]; FeatureVector fv = (FeatureVector) instance.getData(); // Make sure the feature vector's feature dictionary matches // what we are expecting from our data pipe (and thus our notion // of feature probabilities. assert (instancePipe == null || fv.getAlphabet() == instancePipe.getDataAlphabet()); int fvisize = fv.numLocations(); prior.addLogProbabilities(scores); // Set the scores according to the feature weights and per-class probabilities for (int fvi = 0; fvi < fvisize; fvi++) { int fi = fv.indexAtLocation(fvi); for (int ci = 0; ci < numClasses; ci++) { // guard against dataAlphabet or target alphabet growing; can happen if classifying // a never before seen feature. Ignore these. 
if (ci >= p.length || fi >= p[ci].size()) continue; scores[ci] += fv.valueAtLocation(fvi) * p[ci].logProbability(fi); } } // Get the scores in the range near zero, where exp() is more accurate double maxScore = Double.NEGATIVE_INFINITY; for (int ci = 0; ci < numClasses; ci++) if (scores[ci] > maxScore) maxScore = scores[ci]; for (int ci = 0; ci < numClasses; ci++) scores[ci] -= maxScore; // Exponentiate and normalize double sum = 0; for (int ci = 0; ci < numClasses; ci++) sum += (scores[ci] = Math.exp(scores[ci])); for (int ci = 0; ci < numClasses; ci++) scores[ci] /= sum; // Create and return a Classification object return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores)); }
/** * Process the xml file and output a csv file with the results in the same directory * * @param dataFile the xml file to process * @suffix suffix for identifying the data file * @param suffix * @throws ResourceInitializationException * @throws UIMAException * @throws IOException * @throws AnalysisEngineProcessException * @throws SimilarityException */ private void processEnglishFile(String dataFile, String suffix) throws ResourceInitializationException, UIMAException, IOException, AnalysisEngineProcessException, SimilarityException { /** Parameters for matching tree structures */ String parameterList = Joiner.on(",") .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE}); /** Marker which adds relational information to a pair of trees */ MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors()); /** Load stopwords for english */ marker.useStopwords(Stopwords.STOPWORD_EN); /** Tree serializer for converting tree structures to string */ TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets(); /** Instantiate CASes */ JCas questionCas = JCasFactory.createJCas(); JCas commentCas = JCasFactory.createJCas(); WriteFile out = new WriteFile(dataFile + ".csv"); Document doc = Jsoup.parse(new File(dataFile), "UTF-8"); doc.select("QURAN").remove(); doc.select("HADEETH").remove(); boolean firstRow = true; /** Consume data */ Elements questions = doc.getElementsByTag("Question"); int numberOfQuestions = questions.size(); int questionNumber = 1; Map<String, Boolean> commentIsDialogue = new HashMap<>(); for (Element question : questions) { System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions); /** Parse question node */ String qid = question.attr("QID"); String qcategory = question.attr("QCATEGORY"); String qdate = question.attr("QDATE"); String quserid = question.attr("QUSERID"); String qtype = question.attr("QTYPE"); String qgold_yn = 
question.attr("QGOLD_YN"); String qsubject = question.getElementsByTag("QSubject").get(0).text(); String qbody = question.getElementsByTag("QBody").get(0).text(); /** Setup question CAS */ questionCas.reset(); questionCas.setDocumentLanguage("en"); questionCas.setDocumentText(qsubject + ". " + qbody); /** Run the UIMA pipeline */ SimplePipeline.runPipeline(questionCas, this.analysisEngineList); // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody)); /** Parse comment nodes */ Elements comments = question.getElementsByTag("Comment"); for (Element comment : comments) { String cid = comment.attr("CID"); String cuserid = comment.attr("CUSERID"); String cgold = comment.attr("CGOLD"); String cgold_yn = comment.attr("CGOLD_YN"); String csubject = comment.getElementsByTag("CSubject").get(0).text(); String cbody = comment.getElementsByTag("CBody").get(0).text(); /** Setup comment CAS */ commentCas.reset(); commentCas.setDocumentLanguage("en"); commentCas.setDocumentText(csubject + ". " + cbody); /** Run the UIMA pipeline */ SimplePipeline.runPipeline(commentCas, this.analysisEngineList); // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + // cbody)); FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList); /** * ************************************* * * * PLUG YOUR FEATURES HERE * * * * * ************************************* */ /** * fv is actually an AugmentableFeatureVector from the Mallet library * * <p>Internally the features are named so you must specify an unique identifier. 
* * <p>An example: * * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42); * * <p>or: * * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv; * afv.add("your_super_feature_id", 42); */ boolean quseridEqCuserid = quserid.equals(cuserid); if (quseridEqCuserid) { commentIsDialogue.put(cid, true); } // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid); /** * ************************************* * * * THANKS! * * * * * ************************************* */ /** Produce output line */ if (firstRow) { out.write("qid,cgold,cgold_yn"); for (int i = 0; i < fv.numLocations(); i++) { int featureIndex = i + 1; out.write(",f" + featureIndex); } out.write("\n"); firstRow = false; } List<Double> features = this.serializeFv(fv); out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features)); /** Produce also the file needed to train structural models */ if (PRODUCE_SVMLIGHTTK_DATA) { produceSVMLightTKExample( questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features); } } } for (String commentId : commentIsDialogue.keySet()) { this.fm.writeLn(dataFile + ".dialogue.txt", commentId); } this.fm.closeFiles(); out.close(); }
public void processArabicFile(Analyzer analyzer, String dataFile, String suffix) throws SimilarityException, UIMAException, IOException { /** We do not have a lemmatizer so we work with tokens */ String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE}); /** Instantiate CASes */ JCas questionCas = JCasFactory.createJCas(); JCas commentCas = JCasFactory.createJCas(); WriteFile out = new WriteFile(dataFile + ".csv"); Document doc = Jsoup.parse(new File(dataFile), "UTF-8"); boolean firstRow = true; /** Consume data */ Elements questions = doc.getElementsByTag("Question"); int numberOfQuestions = questions.size(); int questionNumber = 1; for (Element question : questions) { System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions); /** Parse question node */ String qid = question.attr("QID"); String qcategory = question.attr("QCATEGORY"); String qdate = question.attr("QDATE"); String qsubject = question .getElementsByTag("QSubject") .get(0) .text() .replaceAll("/", "") .replaceAll("~", ""); String qbody = question.getElementsByTag("QBody").get(0).text().replaceAll("/", "").replaceAll("~", ""); /** Get analyzed text for question */ if (USE_QCRI_ALT_TOOLS) { questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody); } else { questionCas.reset(); questionCas.setDocumentLanguage("ar"); questionCas.setDocumentText(qsubject + ". 
" + qbody); SimplePipeline.runPipeline(questionCas, this.analysisEngineList); } /** Parse answer nodes */ Elements comments = question.getElementsByTag("Answer"); for (Element comment : comments) { String cid = comment.attr("CID"); String cgold = comment.attr("CGOLD"); String cbody = comment.text().replaceAll("/", "").replaceAll("~", ""); ; /** Get analyzed text for comment */ if (USE_QCRI_ALT_TOOLS) { commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody); } else { commentCas.reset(); commentCas.setDocumentLanguage("ar"); commentCas.setDocumentText(cbody); SimplePipeline.runPipeline(commentCas, this.analysisEngineList); } /** Compute features between question and comment */ FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList); /** * ************************************* * * * PLUG YOUR FEATURES HERE * * * * * ************************************* */ /** * fv is actually an AugmentableFeatureVector from the Mallet library * * <p>Internally the features are named so you must specify an unique identifier. * * <p>An example: * * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42); * * <p>or: * * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv; * afv.add("your_super_feature_id", 42); */ /** * ************************************* * * * THANKS! * * * * * ************************************* */ /** Produce output line */ if (firstRow) { out.write("cid,cgold"); for (int i = 0; i < fv.numLocations(); i++) { int featureIndex = i + 1; out.write(",f" + featureIndex); } out.write("\n"); firstRow = false; } List<Double> features = this.serializeFv(fv); /** Produce output line */ out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features)); } } this.fm.closeFiles(); out.close(); }