Example #1
0
File: C45.java Project: alei76/tctm
  /**
   * Classifies an instance by descending the tree from the root to a leaf and returning the base
   * label distribution held by that leaf's gain ratio.
   *
   * @param instance the instance to classify; its data is cast to a {@code FeatureVector}
   * @return a Classification built from the instance, this classifier and the leaf's distribution
   */
  public Classification classify(Instance instance) {
    final FeatureVector features = (FeatureVector) instance.getData();
    // When an instance pipe is set, the vector must use that pipe's data alphabet.
    assert (instancePipe == null
        || features.getAlphabet() == this.instancePipe.getDataAlphabet());

    final Node reached = getLeaf(m_root, features);
    return new Classification(instance, this, reached.getGainRatio().getBaseLabelDistribution());
  }
  /**
   * Builds a copy of an instance list in which every element carries a newly built feature vector.
   *
   * <p>The list itself is cloned, then each element is replaced by a new {@code Instance} whose
   * {@code FeatureVector} is constructed from freshly allocated index and value arrays. Target,
   * name and source are taken over from the existing element unchanged.
   *
   * @param instances the list to copy; every element's data is cast to {@code FeatureVector}
   * @return the cloned list holding the re-created instances
   */
  public static InstanceList copy(InstanceList instances) {
    final InstanceList duplicate = (InstanceList) instances.clone();
    final Alphabet dataAlphabet = duplicate.getDataAlphabet();

    for (int pos = 0; pos < duplicate.size(); pos++) {
      final Instance current = duplicate.get(pos);
      final FeatureVector sourceVector = (FeatureVector) ((Instance) current.clone()).getData();

      final int[] sourceIndices = sourceVector.getIndices();
      final double[] sourceValues = sourceVector.getValues();

      // Hand the new vector its own arrays rather than the ones read from the source vector.
      final int[] indexCopy = sourceIndices.clone();
      final double[] valueCopy = new double[sourceIndices.length];
      System.arraycopy(sourceValues, 0, valueCopy, 0, sourceIndices.length);

      duplicate.set(
          pos,
          new Instance(
              new FeatureVector(dataAlphabet, indexCopy, valueCopy),
              current.getTarget(),
              current.getName(),
              current.getSource()));
    }

    return duplicate;
  }
Example #3
0
File: C45.java Project: alei76/tctm
 /**
  * Partitions this node's instances into a left and a right child node.
  *
  * <p>An instance goes left when its value at the gain ratio's max-valued index is less than or
  * equal to the gain ratio's max-valued threshold; every other instance goes right.
  *
  * @throws IllegalStateException if the node no longer holds its instance list
  */
 public void split() {
   if (m_ilist == null) {
     throw new IllegalStateException("Frozen.  Cannot split.");
   }

   final int total = m_instIndices.length;
   final boolean[] goesLeft = new boolean[total];
   int leftCount = 0;
   for (int pos = 0; pos < total; pos++) {
     final FeatureVector features = (FeatureVector) m_ilist.get(m_instIndices[pos]).getData();
     goesLeft[pos] =
         features.value(m_gainRatio.getMaxValuedIndex()) <= m_gainRatio.getMaxValuedThreshold();
     if (goesLeft[pos]) {
       leftCount++;
     }
   }

   final int rightCount = total - leftCount;
   logger.info("leftChild.size=" + leftCount + " rightChild.size=" + rightCount);

   // Second pass: distribute the instance indices now that both sizes are known.
   final int[] leftIndices = new int[leftCount];
   final int[] rightIndices = new int[rightCount];
   int nextLeft = 0;
   int nextRight = 0;
   for (int pos = 0; pos < total; pos++) {
     if (goesLeft[pos]) {
       leftIndices[nextLeft++] = m_instIndices[pos];
     } else {
       rightIndices[nextRight++] = m_instIndices[pos];
     }
   }

   m_leftChild = new Node(m_ilist, this, m_minNumInsts, leftIndices);
   m_rightChild = new Node(m_ilist, this, m_minNumInsts, rightIndices);
 }
Example #4
0
  /**
   * Sums, over every location of the instance's feature vector, the value at that location
   * multiplied by the log-probability {@code p[labelIndex]} reports for the feature index there.
   *
   * @param instance instance whose data is cast to a {@code FeatureVector}
   * @param labelIndex index into {@code p} selecting which entry supplies the log-probabilities
   * @return the accumulated weighted log-probability; 0 when the vector has no locations
   */
  private double dataLogProbability(Instance instance, int labelIndex) {
    final FeatureVector features = (FeatureVector) instance.getData();
    final int locationCount = features.numLocations();

    double total = 0;
    int loc = 0;
    while (loc < locationCount) {
      final double weight = features.valueAtLocation(loc);
      final double featureLogProb = p[labelIndex].logProbability(features.indexAtLocation(loc));
      total += weight * featureLogProb;
      loc++;
    }
    return total;
  }
Example #5
0
  /**
   * Accumulates per-feature totals and document frequencies over {@code instances}.
   *
   * <p>The data type of the first instance decides how every instance is read:
   *
   * <ul>
   *   <li>{@code FeatureSequence}: occurrences are tallied per document; each distinct feature of
   *       a document adds its in-document count to {@code featureCounts} and 1 to {@code
   *       documentFrequencies}.
   *   <li>{@code FeatureVector}: each location adds its value to {@code featureCounts} and 1 to
   *       {@code documentFrequencies}.
   * </ul>
   *
   * <p>An empty list or an unsupported data class is logged and nothing is counted. A running
   * instance number is printed to {@code System.err} every 1000 instances.
   */
  public void count() {
    if (instances.size() == 0) {
      logger.info("Instance list is empty");
      return;
    }

    int index = 0;
    Object firstData = instances.get(0).getData();

    if (firstData instanceof FeatureSequence) {

      for (Instance instance : instances) {
        FeatureSequence features = (FeatureSequence) instance.getData();

        // Tally occurrences within this document only; a fresh map per document.
        TIntIntHashMap docCounts = new TIntIntHashMap();
        for (int i = 0; i < features.getLength(); i++) {
          docCounts.adjustOrPutValue(features.getIndexAtPosition(i), 1, 1);
        }

        // Fold every distinct feature of the document into the totals. All keys must be
        // visited: stopping at keys.length - 1 dropped one feature from each document.
        for (int feature : docCounts.keys()) {
          featureCounts[feature] += docCounts.get(feature);
          documentFrequencies[feature]++;
        }

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else if (firstData instanceof FeatureVector) {

      for (Instance instance : instances) {
        FeatureVector features = (FeatureVector) instance.getData();

        for (int location = 0; location < features.numLocations(); location++) {
          int feature = features.indexAtLocation(location);
          double value = features.valueAtLocation(location);

          documentFrequencies[feature]++;
          featureCounts[feature] += value;
        }

        index++;
        if (index % 1000 == 0) {
          System.err.println(index);
        }
      }
    } else {
      logger.info("Unsupported data class: " + firstData.getClass().getName());
    }
  }
  /**
   * Returns a copy of the given list whose feature values are linearly rescaled per feature.
   *
   * <p>For each feature index the minimum and maximum are taken over the locations actually
   * present in the copied vectors. A value equal to the minimum becomes {@code lower}, a value
   * equal to the maximum becomes {@code upper}, and values in between are interpolated. Features
   * whose minimum equals their maximum keep their value.
   *
   * @param trainingList the list to scale; it is copied via {@code copy} first
   * @param lower the output value assigned to a feature's minimum
   * @param upper the output value assigned to a feature's maximum
   * @return the scaled copy
   */
  public static InstanceList scale(InstanceList trainingList, double lower, double upper) {
    final InstanceList scaled = copy(trainingList);
    final Alphabet dataAlphabet = scaled.getDataAlphabet();

    final double[] highest = new double[dataAlphabet.size()];
    final double[] lowest = new double[dataAlphabet.size()];
    for (int f = 0; f < highest.length; f++) {
      highest[f] = -Double.MAX_VALUE;
      lowest[f] = Double.MAX_VALUE;
    }

    // Pass 1: observed range of every feature.
    for (int n = 0; n < scaled.size(); n++) {
      final FeatureVector vector = (FeatureVector) scaled.get(n).getData();
      for (int loc = 0; loc < vector.numLocations(); loc++) {
        final int feature = vector.indexAtLocation(loc);
        final double value = vector.valueAtLocation(loc);
        highest[feature] = Math.max(value, highest[feature]);
        lowest[feature] = Math.min(value, lowest[feature]);
      }
    }

    // Pass 2: overwrite each stored value with its rescaled counterpart.
    for (int n = 0; n < scaled.size(); n++) {
      final FeatureVector vector = (FeatureVector) scaled.get(n).getData();
      for (int loc = 0; loc < vector.numLocations(); loc++) {
        final int feature = vector.indexAtLocation(loc);
        final double value = vector.valueAtLocation(loc);
        vector.setValueAtLocation(
            loc, rescaleValue(value, lowest[feature], highest[feature], lower, upper));
      }
    }

    return scaled;
  }

  /** Maps {@code value} from [minValue, maxValue] onto [lower, upper]; identity if the range is empty. */
  private static double rescaleValue(
      double value, double minValue, double maxValue, double lower, double upper) {
    if (maxValue == minValue) {
      return value;
    }
    if (value == minValue) {
      return lower;
    }
    if (value == maxValue) {
      return upper;
    }
    return lower + (upper - lower) * (value - minValue) / (maxValue - minValue);
  }
 /**
  * Collects the values of a feature vector into a list, one entry per location.
  *
  * @param fv the feature vector to read
  * @return for each location, in order, the value {@code fv} reports for the feature index
  *     stored at that location
  */
 protected List<Double> serializeFv(FeatureVector fv) {
   final List<Double> serialized = new ArrayList<>();
   final int locationCount = fv.numLocations();
   final int[] featureIds = fv.getIndices();
   for (int loc = 0; loc < locationCount; loc++) {
     serialized.add(fv.value(featureIds[loc]));
   }
   return serialized;
 }
  /**
   * Prints the carrier's data and target sequences to {@code writer}, one line per position.
   *
   * <p>Each line starts with the target at that position, followed by one space-prefixed entry
   * per feature-vector location consisting of the feature name immediately followed by its
   * value. A blank line is printed after the last position.
   *
   * @param carrier instance whose data and target are both cast to {@code Sequence}
   * @return the same carrier, unmodified
   * @throws IllegalArgumentException if data and target differ in length, or if the data is
   *     neither a {@code TokenSequence} nor a {@code FeatureVectorSequence}
   * @throws UnsupportedOperationException if the data is a {@code TokenSequence}
   */
  public Instance pipe(Instance carrier) {
    final Sequence data = (Sequence) carrier.getData();
    final Sequence target = (Sequence) carrier.getTarget();

    if (data.size() != target.size()) {
      throw new IllegalArgumentException(
          "Trying to print into SimpleTagger format, where data and target lengths do not match\n"
              + "data.length = "
              + data.size()
              + ", target.length = "
              + target.size());
    }

    final int length = data.size();

    if (data instanceof TokenSequence) {
      throw new UnsupportedOperationException("Not yet implemented.");
    }
    if (!(data instanceof FeatureVectorSequence)) {
      throw new IllegalArgumentException("Don't know how to print data of type " + data);
    }

    final FeatureVectorSequence vectors = (FeatureVectorSequence) data;
    // Feature names are resolved through the first vector's alphabet. It is null only for an
    // empty sequence, in which case the loop below does not execute.
    final Alphabet alphabet =
        (vectors.size() > 0) ? vectors.getFeatureVector(0).getAlphabet() : null;

    for (int pos = 0; pos < length; pos++) {
      Object label = target.get(pos);
      writer.print(label);

      final FeatureVector vector = vectors.getFeatureVector(pos);
      for (int loc = 0; loc < vector.numLocations(); loc++) {
        writer.print(' ');
        final String featureName = alphabet.lookupObject(vector.indexAtLocation(loc)).toString();
        final double featureValue = vector.valueAtLocation(loc);
        writer.print(featureName + featureValue);
      }
      writer.println();
    }

    writer.println();

    return carrier;
  }
Example #9
0
 /**
  * Increments the count in {@code map} for the feature names of a feature vector.
  *
  * <p>When {@code contextOnly} is set, only names ending in {@code /+1}, {@code /-1}, {@code /-2}
  * or {@code /+2} are counted; otherwise every name is counted.
  *
  * @param fv the feature vector whose feature names are looked up in its own alphabet
  */
 public void addToken(FeatureVector fv) {
   final int[] featureIds = fv.getIndices();
   final Alphabet alphabet = fv.getAlphabet();
   final int locationCount = fv.numLocations();

   for (int loc = 0; loc < locationCount; loc++) {
     final String name = alphabet.lookupObject(featureIds[loc]).toString();
     if (contextOnly
         && !(name.endsWith("/+1")
             || name.endsWith("/-1")
             || name.endsWith("/-2")
             || name.endsWith("/+2"))) {
       continue;
     }
     map.increment(name);
   }
 }
Example #10
0
  /**
   * Classifies an instance with the trained per-class feature estimates and the class prior.
   *
   * <p>The alphabet of the instance's feature vector must match the data alphabet of the pipe
   * used to train the classifier (checked by assertion when a pipe is set).
   *
   * @param instance the instance to classify; its data field must be a {@code FeatureVector}
   * @return a Classification holding a normalized {@code LabelVector} over the label alphabet
   */
  public Classification classify(Instance instance) {
    // The label alphabet may have grown since training, so the score array is sized from its
    // current size. Per the original author's note, labels unseen in training are expected to
    // get a log prior of minus infinity.
    final int labelCount = getLabelAlphabet().size();
    final double[] scores = new double[labelCount];
    final FeatureVector features = (FeatureVector) instance.getData();
    assert (instancePipe == null || features.getAlphabet() == instancePipe.getDataAlphabet());
    final int locationCount = features.numLocations();

    prior.addLogProbabilities(scores);

    // Add value * log-probability for every (feature, class) pair that p covers. Pairs outside
    // p are skipped; they arise when an alphabet grew after training.
    for (int loc = 0; loc < locationCount; loc++) {
      final int featureIndex = features.indexAtLocation(loc);
      for (int label = 0; label < labelCount; label++) {
        if (label < p.length && featureIndex < p[label].size()) {
          scores[label] += features.valueAtLocation(loc) * p[label].logProbability(featureIndex);
        }
      }
    }

    // Shift by the maximum so the exponentials are taken near zero.
    double best = Double.NEGATIVE_INFINITY;
    for (double score : scores) {
      if (score > best) {
        best = score;
      }
    }

    // Exponentiate the shifted scores and normalize them to sum to one.
    double normalizer = 0;
    for (int label = 0; label < labelCount; label++) {
      scores[label] -= best;
      scores[label] = Math.exp(scores[label]);
      normalizer += scores[label];
    }
    for (int label = 0; label < labelCount; label++) {
      scores[label] /= normalizer;
    }

    return new Classification(instance, this, new LabelVector(getLabelAlphabet(), scores));
  }
Example #11
0
File: C45.java Project: alei76/tctm
 /**
  * Walks down from {@code node} until a node without children is reached.
  *
  * <p>At each inner node the walk goes left when the vector's value at the gain ratio's
  * max-valued index is less than or equal to the gain ratio's max-valued threshold, and right
  * otherwise.
  *
  * @param node the node to start from
  * @param fv the feature vector steering the descent
  * @return the first node on the path that has neither a left nor a right child
  */
 private Node getLeaf(Node node, FeatureVector fv) {
   Node current = node;
   while (current.getLeftChild() != null || current.getRightChild() != null) {
     final boolean goLeft =
         fv.value(current.getGainRatio().getMaxValuedIndex())
             <= current.getGainRatio().getMaxValuedThreshold();
     current = goLeft ? current.getLeftChild() : current.getRightChild();
   }
   return current;
 }
Example #12
0
  /**
   * Command-line wrapper to train, test, or run a generic CRF-based tagger.
   *
   * @param args the command line arguments. Options (shell and Java quoting should be added as
   *     needed):
   *     <dl>
   *       <dt><code>--help</code> <em>boolean</em>
   *       <dd>Print this command line option usage information. Give <code>true</code> for longer
   *           documentation. Default is <code>false</code>.
   *       <dt><code>--prefix-code</code> <em>Java-code</em>
   *       <dd>Java code you want run before any other interpreted code. Note that the text is
   *           interpreted without modification, so unlike some other Java code options, you need to
   *           include any necessary 'new's. Default is null.
   *       <dt><code>--gaussian-variance</code> <em>positive-number</em>
   *       <dd>The Gaussian prior variance used for training. Default is 10.0.
   *       <dt><code>--train</code> <em>boolean</em>
   *       <dd>Whether to train. Default is <code>false</code>.
   *       <dt><code>--iterations</code> <em>positive-integer</em>
   *       <dd>Number of training iterations. Default is 500.
   *       <dt><code>--test</code> <code>lab</code> or <code>seg=</code><em>start-1</em><code>.
   *           </code><em>continue-1</em><code>,</code>...<code>,</code><em>start-n</em><code>.
   *           </code><em>continue-n</em>
   *       <dd>Test measuring labeling or segmentation (<em>start-i</em>, <em>continue-i</em>)
   *           accuracy. Default is no testing.
   *       <dt><code>--training-proportion</code> <em>number-between-0-and-1</em>
   *       <dd>Fraction of data to use for training in a random split. Default is 0.5.
   *       <dt><code>--model-file</code> <em>filename</em>
   *       <dd>The filename for reading (train/run) or saving (train) the model. Default is null.
   *       <dt><code>--random-seed</code> <em>integer</em>
   *       <dd>The random seed for randomly selecting a proportion of the instance list for training
   *           Default is 0.
   *       <dt><code>--orders</code> <em>comma-separated-integers</em>
   *       <dd>List of label Markov orders (main and backoff) Default is 1.
   *       <dt><code>--forbidden</code> <em>regular-expression</em>
   *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> matches the expression, the
   *           corresponding transition is forbidden. Default is <code>\\s</code> (nothing
   *           forbidden).
   *       <dt><code>--allowed</code> <em>regular-expression</em>
   *       <dd>If <em>label-1</em><code>,</code><em>label-2</em> does not match the expression, the
   *           corresponding expression is forbidden. Default is <code>.*</code> (everything
   *           allowed).
   *       <dt><code>--default-label</code> <em>string</em>
   *       <dd>Label for initial context and uninteresting tokens. Default is <code>O</code>.
   *       <dt><code>--viterbi-output</code> <em>boolean</em>
   *       <dd>Print Viterbi periodically during training. Default is <code>false</code>.
   *       <dt><code>--fully-connected</code> <em>boolean</em>
   *       <dd>Include all allowed transitions, even those not in training data. Default is <code>
   *           true</code>.
   *       <dt><code>--n-best</code> <em>positive-integer</em>
   *       <dd>Number of answers to output when applying model. Default is 1.
   *       <dt><code>--include-input</code> <em>boolean</em>
   *       <dd>Whether to include input features when printing decoding output. Default is <code>
   *           false</code>.
   *     </dl>
   *     Remaining arguments:
   *     <ul>
   *       <li><em>training-data-file</em> if training
   *       <li><em>training-and-test-data-file</em>, if training and testing with random split
   *       <li><em>training-data-file</em> <em>test-data-file</em> if training and testing from
   *           separate files
   *       <li><em>test-data-file</em> if testing
   *       <li><em>input-data-file</em> if applying to new data (unlabeled)
   *     </ul>
   *
   * @exception Exception if an error occurs
   */
  public static void main(String[] args) throws Exception {
    Reader trainingFile = null, testFile = null;
    InstanceList trainingData = null, testData = null;
    int restArgs = commandOptions.processOptions(args);
    if (restArgs == args.length) {
      commandOptions.printUsage(true);
      throw new IllegalArgumentException("Missing data file(s)");
    }

    // Open the data file(s) named by the remaining arguments.
    // NOTE(review): these readers are never closed explicitly in this method.
    if (trainOption.value) {
      trainingFile = new FileReader(new File(args[restArgs]));
      if (testOption.value != null && restArgs < args.length - 1) {
        testFile = new FileReader(new File(args[restArgs + 1]));
      }
    } else {
      testFile = new FileReader(new File(args[restArgs]));
    }

    Pipe p = null;
    CRF crf = null;
    TransducerEvaluator eval = null;
    if (continueTrainingOption.value || !trainOption.value) {
      // An existing model is required: reuse its input pipe so features map consistently.
      if (modelOption.value == null) {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Missing model file option");
      }
      // NOTE(review): Java-native deserialization; only load model files from trusted sources.
      ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
      try {
        crf = (CRF) s.readObject();
      } finally {
        // Close the stream even when deserialization fails.
        s.close();
      }
      p = crf.getInputPipe();
    } else {
      p = new SimpleTaggerSentence2FeatureVectorSequence();
      p.getTargetAlphabet().lookupIndex(defaultOption.value);
    }

    // Instances in the data files are separated by blank lines; compile the pattern once.
    final Pattern blankLine = Pattern.compile("^\\s*$");

    if (trainOption.value) {
      p.setTargetProcessing(true);
      trainingData = new InstanceList(p);
      trainingData.addThruPipe(new LineGroupIterator(trainingFile, blankLine, true));
      logger.info("Number of features in training data: " + p.getDataAlphabet().size());
      if (testOption.value != null) {
        if (testFile != null) {
          testData = new InstanceList(p);
          testData.addThruPipe(new LineGroupIterator(testFile, blankLine, true));
        } else {
          // No separate test file: split the training data randomly.
          Random r = new Random(randomSeedOption.value);
          InstanceList[] trainingLists =
              trainingData.split(
                  r, new double[] {trainingFractionOption.value, 1 - trainingFractionOption.value});
          trainingData = trainingLists[0];
          testData = trainingLists[1];
        }
      }
    } else if (testOption.value != null) {
      p.setTargetProcessing(true);
      testData = new InstanceList(p);
      testData.addThruPipe(new LineGroupIterator(testFile, blankLine, true));
    } else {
      // Applying the model to unlabeled input: no targets to process.
      p.setTargetProcessing(false);
      testData = new InstanceList(p);
      testData.addThruPipe(new LineGroupIterator(testFile, blankLine, true));
    }
    logger.info("Number of predicates: " + p.getDataAlphabet().size());

    if (testOption.value != null) {
      if (testOption.value.startsWith("lab")) {
        eval =
            new TokenAccuracyEvaluator(
                new InstanceList[] {trainingData, testData}, new String[] {"Training", "Testing"});
      } else if (testOption.value.startsWith("seg=")) {
        // Expected form: seg=start-1.continue-1,...,start-n.continue-n
        String[] pairs = testOption.value.substring(4).split(",");
        if (pairs.length < 1) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException(
              "Missing segment start/continue labels: " + testOption.value);
        }
        String[] startTags = new String[pairs.length];
        String[] continueTags = new String[pairs.length];
        for (int i = 0; i < pairs.length; i++) {
          String[] pair = pairs[i].split("\\.");
          if (pair.length != 2) {
            commandOptions.printUsage(true);
            throw new IllegalArgumentException(
                "Incorrectly-specified segment start and end labels: " + pairs[i]);
          }
          startTags[i] = pair[0];
          continueTags[i] = pair[1];
        }
        eval =
            new MultiSegmentationEvaluator(
                new InstanceList[] {trainingData, testData},
                new String[] {"Training", "Testing"},
                startTags,
                continueTags);
      } else {
        commandOptions.printUsage(true);
        throw new IllegalArgumentException("Invalid test option: " + testOption.value);
      }
    }

    if (p.isTargetProcessing()) {
      Alphabet targets = p.getTargetAlphabet();
      StringBuilder buf = new StringBuilder("Labels:");
      for (int i = 0; i < targets.size(); i++) {
        buf.append(" ").append(targets.lookupObject(i).toString());
      }
      logger.info(buf.toString());
    }
    if (trainOption.value) {
      crf =
          train(
              trainingData,
              testData,
              eval,
              ordersOption.value,
              defaultOption.value,
              forbiddenOption.value,
              allowedOption.value,
              connectedOption.value,
              iterationsOption.value,
              gaussianVarianceOption.value,
              crf);
      if (modelOption.value != null) {
        ObjectOutputStream s = new ObjectOutputStream(new FileOutputStream(modelOption.value));
        try {
          s.writeObject(crf);
        } finally {
          // Close the stream even when serialization fails.
          s.close();
        }
      }
    } else {
      if (crf == null) {
        if (modelOption.value == null) {
          commandOptions.printUsage(true);
          throw new IllegalArgumentException("Missing model file option");
        }
        // NOTE(review): Java-native deserialization; only load model files from trusted sources.
        ObjectInputStream s = new ObjectInputStream(new FileInputStream(modelOption.value));
        try {
          crf = (CRF) s.readObject();
        } finally {
          s.close();
        }
      }
      if (eval != null) {
        test(new NoopTransducerTrainer(crf), eval, testData);
      } else {
        // No evaluator: decode each input sequence and print the n-best labels per position.
        boolean includeInput = includeInputOption.value();
        for (int i = 0; i < testData.size(); i++) {
          Sequence input = (Sequence) testData.get(i).getData();
          Sequence[] outputs = apply(crf, input, nBestOption.value);
          int k = outputs.length;
          boolean error = false;
          for (int a = 0; a < k; a++) {
            if (outputs[a].size() != input.size()) {
              System.err.println("Failed to decode input sequence " + i + ", answer " + a);
              error = true;
            }
          }
          if (!error) {
            for (int j = 0; j < input.size(); j++) {
              StringBuilder buf = new StringBuilder();
              for (int a = 0; a < k; a++) {
                buf.append(outputs[a].get(j).toString()).append(" ");
              }
              if (includeInput) {
                FeatureVector fv = (FeatureVector) input.get(j);
                buf.append(fv.toString(true));
              }
              System.out.println(buf.toString());
            }
            System.out.println();
          }
        }
      }
    }
  }
Example #13
0
  /**
   * Processes the English XML data file and writes a CSV file with the results in the same
   * directory ({@code dataFile + ".csv"}).
   *
   * <p>Every question/comment pair is run through the UIMA pipeline and its pair features are
   * written as one CSV row. The ids of comments whose {@code CUSERID} equals the question's
   * {@code QUSERID} are written through {@code fm} to {@code dataFile + ".dialogue.txt"}. When
   * {@code PRODUCE_SVMLIGHTTK_DATA} is set, an SVMLight-TK example is produced per pair as well.
   *
   * @param dataFile the xml file to process
   * @param suffix suffix for identifying the data file
   * @throws ResourceInitializationException
   * @throws UIMAException
   * @throws IOException
   * @throws AnalysisEngineProcessException
   * @throws SimilarityException
   */
  private void processEnglishFile(String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {

    // Parameters for matching tree structures
    String parameterList =
        Joiner.on(",")
            .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Marker which adds relational information to a pair of trees
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

    // Load stopwords for english
    marker.useStopwords(Stopwords.STOPWORD_EN);

    // Tree serializer for converting tree structures to string
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    // Instantiate CASes
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      doc.select("QURAN").remove();
      doc.select("HADEETH").remove();

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");
      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      // Keys are the ids of comments written by the author of their question.
      Map<String, Boolean> commentIsDialogue = new HashMap<>();

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node
        String qid = question.attr("QID");
        String quserid = question.attr("QUSERID");
        String qsubject = question.getElementsByTag("QSubject").get(0).text();
        String qbody = question.getElementsByTag("QBody").get(0).text();

        // Setup question CAS
        questionCas.reset();
        questionCas.setDocumentLanguage("en");
        questionCas.setDocumentText(qsubject + ". " + qbody);

        // Run the UIMA pipeline
        SimplePipeline.runPipeline(questionCas, this.analysisEngineList);

        // Parse comment nodes
        Elements comments = question.getElementsByTag("Comment");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cuserid = comment.attr("CUSERID");
          String cgold = comment.attr("CGOLD");
          String cgold_yn = comment.attr("CGOLD_YN");
          String csubject = comment.getElementsByTag("CSubject").get(0).text();
          String cbody = comment.getElementsByTag("CBody").get(0).text();

          // Setup comment CAS
          commentCas.reset();
          commentCas.setDocumentLanguage("en");
          commentCas.setDocumentText(csubject + ". " + cbody);

          // Run the UIMA pipeline
          SimplePipeline.runPipeline(commentCas, this.analysisEngineList);

          FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList);

          // ***************************************
          // *      PLUG YOUR FEATURES HERE        *
          // ***************************************
          //
          // fv is actually an AugmentableFeatureVector from the Mallet library.
          // Internally the features are named so you must specify an unique identifier.
          //
          // An example:
          //
          //   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
          //
          // or:
          //
          //   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
          //   afv.add("your_super_feature_id", 42);

          boolean quseridEqCuserid = quserid.equals(cuserid);
          if (quseridEqCuserid) {
            commentIsDialogue.put(cid, true);
          }

          // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

          // ***************************************
          // *              THANKS!                *
          // ***************************************

          // Produce the header once, sized from the first feature vector.
          // NOTE(review): the first column is labelled "qid" but the rows below write cid;
          // left unchanged because consumers may depend on the header text. Confirm intent.
          if (firstRow) {
            out.write("qid,cgold,cgold_yn");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          List<Double> features = this.serializeFv(fv);

          // Produce output line
          out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features));

          // Produce also the file needed to train structural models
          if (PRODUCE_SVMLIGHTTK_DATA) {
            produceSVMLightTKExample(
                questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features);
          }
        }
      }

      for (String commentId : commentIsDialogue.keySet()) {
        this.fm.writeLn(dataFile + ".dialogue.txt", commentId);
      }

      this.fm.closeFiles();
    } finally {
      // Close the CSV writer even when parsing or the pipeline fails part-way through.
      out.close();
    }
  }
Example #14
0
  /**
   * Processes the Arabic XML data file and writes a CSV file with the results in the same
   * directory ({@code dataFile + ".csv"}).
   *
   * <p>For every question, each of its {@code Answer} elements is analyzed together with the
   * question and the resulting pair features are written as one CSV row keyed by {@code
   * qid-cid}. Text is analyzed through {@code getPreliminarCas} when {@code USE_QCRI_ALT_TOOLS}
   * is set, and through the UIMA pipeline otherwise. The characters {@code /} and {@code ~} are
   * stripped from all texts before analysis.
   *
   * @param analyzer analyzer handed to {@code getPreliminarCas}
   * @param dataFile the xml file to process
   * @param suffix not used by this method
   * @throws SimilarityException
   * @throws UIMAException
   * @throws IOException
   */
  public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
      throws SimilarityException, UIMAException, IOException {
    // We do not have a lemmatizer so we work with tokens
    String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    // Instantiate CASes
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");
    try {
      Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

      boolean firstRow = true;

      // Consume data
      Elements questions = doc.getElementsByTag("Question");

      int numberOfQuestions = questions.size();
      int questionNumber = 1;

      for (Element question : questions) {
        System.out.println(
            "[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);

        // Parse question node
        String qid = question.attr("QID");
        String qsubject =
            question.getElementsByTag("QSubject").get(0).text().replace("/", "").replace("~", "");
        String qbody =
            question.getElementsByTag("QBody").get(0).text().replace("/", "").replace("~", "");

        // Get analyzed text for question
        if (USE_QCRI_ALT_TOOLS) {
          questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
        } else {
          questionCas.reset();
          questionCas.setDocumentLanguage("ar");
          questionCas.setDocumentText(qsubject + ". " + qbody);
          SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
        }

        // Parse answer nodes
        Elements comments = question.getElementsByTag("Answer");
        for (Element comment : comments) {
          String cid = comment.attr("CID");
          String cgold = comment.attr("CGOLD");
          String cbody = comment.text().replace("/", "").replace("~", "");

          // Get analyzed text for comment
          if (USE_QCRI_ALT_TOOLS) {
            commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
          } else {
            commentCas.reset();
            commentCas.setDocumentLanguage("ar");
            commentCas.setDocumentText(cbody);

            SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
          }

          // Compute features between question and comment
          FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

          // ***************************************
          // *      PLUG YOUR FEATURES HERE        *
          // ***************************************
          //
          // fv is actually an AugmentableFeatureVector from the Mallet library.
          // Internally the features are named so you must specify an unique identifier.
          //
          // An example:
          //
          //   ((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
          //
          // or:
          //
          //   AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
          //   afv.add("your_super_feature_id", 42);

          // ***************************************
          // *              THANKS!                *
          // ***************************************

          // Produce the header once, sized from the first feature vector.
          if (firstRow) {
            out.write("cid,cgold");
            for (int i = 0; i < fv.numLocations(); i++) {
              int featureIndex = i + 1;
              out.write(",f" + featureIndex);
            }
            out.write("\n");

            firstRow = false;
          }

          List<Double> features = this.serializeFv(fv);

          // Produce output line
          out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
        }
      }

      this.fm.closeFiles();
    } finally {
      // Close the CSV writer even when parsing or the pipeline fails part-way through.
      out.close();
    }
  }