@Test
  public void test() throws Exception {

    // write corpus
    String corpusName = "CrossvalidationReaderTest_" + currentTimeMillis();

    List<String> testInput = newArrayList();
    for (int i = 0; i < 100; i++) {
      testInput.add(i + "_");
    }

    CollectionReader cr =
        createReader(
            TextArrayReader.class, PARAM_INPUT, testInput.toArray(new String[testInput.size()]));

    runPipeline(
        cr, //
        createEngine(
            CrossvalidationWriter.class, //
            PARAM_CORPUS_NAME,
            corpusName));

    // retrieve slice 1 in eval mode (the held-out documents)
    CollectionReader cr2 =
        createReader(
            CrossvalidationReader.class, //
            PARAM_CORPUS_NAME,
            corpusName, //
            PARAM_MAX_NR_RESULTS,
            100, //
            PARAM_SLICE,
            1, //
            PARAM_MODE_EVAL,
            true);
    runPipeline(cr2, createEngine(CrossvalidationTest.class));
    assertEquals(
        "should have 10 documents", "95_89_77_36_35_18_25_4_87_37_", join(testCollected, ""));
    testCollected.clear();

    // retrieve slice 1 in train mode (the remaining documents)
    CollectionReader cr3 =
        createReader(
            CrossvalidationReader.class, //
            PARAM_CORPUS_NAME,
            corpusName, //
            PARAM_MAX_NR_RESULTS,
            100, //
            PARAM_SLICE,
            1, //
            PARAM_MODE_EVAL,
            false);
    runPipeline(cr3, createEngine(CrossvalidationTest.class));
    assertEquals("should have 90 documents", 90, testCollected.size());
  }
  @Before
  public void buildModel() throws Exception {
    model = folder.newFile();

    // write the model
    CollectionReaderDescription reader =
        createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            CONSUMER_TEST_DATA_PATH,
            TextReader.PARAM_PATTERNS,
            INCLUDE_PREFIX + "*.txt");

    AnalysisEngineDescription aggregate =
        createEngineDescription(
            createEngineDescription(BreakIteratorSegmenter.class),
            createEngineDescription(
                TfidfConsumer.class,
                TfidfConsumer.PARAM_FEATURE_PATH,
                Token.class,
                TfidfConsumer.PARAM_TARGET_LOCATION,
                model));

    SimplePipeline.runPipeline(reader, aggregate);
  }
 public static void main(String[] args) throws Exception {
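    // Read English text from the console, segment and POS-tag it with OpenNLP, and write the result back to the console.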
   SimplePipeline.runPipeline(
       createReaderDescription(ConsoleReader.class, TextReader.PARAM_LANGUAGE, "en"),
       createEngineDescription(OpenNlpSegmenter.class),
       createEngineDescription(OpenNlpPosTagger.class),
       createEngineDescription(ConsoleWriter.class));
 }
  public static void main(String[] args) throws Exception {
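    // Run the WhiteTextCollectionReader over its corpus and process each document with the annotator defined in WhiteTextCollectionReaderTest.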

    CollectionReader cr = createReader(WhiteTextCollectionReader.class);

    SimplePipeline.runPipeline(
        cr, AnalysisEngineFactory.createEngine(WhiteTextCollectionReaderTest.class));
  }
  private JCas computeCommentCas(Element comment) throws UIMAException {
    JCas cCas = JCasFactory.createJCas();
    String cid = comment.attr("CID");
    String cuserid = comment.attr("CUSERID");
    // String cgold = comment.attr("CGOLD");
    // String cgold = getgold(comment.attr("CGOLD"));

    // String cgold_yn = comment.attr("CGOLD_YN");
    String csubject = comment.getElementsByTag("CSubject").get(0).text();
    String cbody = comment.getElementsByTag("CBody").get(0).text();

    /** Setup comment CAS */
    cCas.reset();
    cCas.setDocumentLanguage("en");
    String commentText =
        TextNormalizer.normalize(SubjectBodyAggregator.getCommentText(csubject, cbody));
    cCas.setDocumentText(commentText);
    // cCas.setDocumentText(csubject + ". " + cbody);

    /** Run the UIMA pipeline */
    SimplePipeline.runPipeline(cCas, this.analysisEngineList);

    // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " + cbody));
    return cCas;
  }
  @Test
  public void test() throws Exception {
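    // Copy a Protein annotation from the initial view into view "blah" (deleting the original), then copy it back.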

    AnalysisEngineDescription copyAnnots =
        createEngineDescription(
            CopyAnnotationsAnnotator2.class, //
            TO_VIEW,
            "blah", //
            PARAM_ANNOTATION_CLASS,
            Protein.class.getName(), //
            DELETE_FROM,
            true);

    JCas jCas = getTestCas();
    assertEquals("has a DocumentAnnotation at first", 1, select(jCas, Annotation.class).size());

    Protein p = new Protein(jCas, 5, 10);
    p.addToIndexes();
    runPipeline(jCas, copyAnnots);
    assertTrue("no more Protein in initial view", !exists(jCas, Protein.class));

    JCas newView = jCas.getView("blah");
    Collection<Protein> pNew = select(newView, Protein.class);
    assertEquals("copied to new view", 1, pNew.size());
    assertTrue(haveSameBeginEnd(p, pNew.iterator().next()));

    // copy it back
    AnalysisEngineDescription copyAnnotsBack =
        createEngineDescription(
            CopyAnnotationsAnnotator2.class, //
            FROM_VIEW,
            "blah", //
            TO_VIEW,
            BlueUima.VIEW_SYSTEM, //
            PARAM_ANNOTATION_CLASS,
            Protein.class.getName(), //
            DELETE_FROM,
            true);
    runPipeline(jCas, copyAnnotsBack);

    assertTrue(!exists(newView, Protein.class));
    Collection<Protein> pBack = select(jCas, Protein.class);
    assertEquals("copied back to initial view", 1, pBack.size());
    assertTrue(haveSameBeginEnd(p, pBack.iterator().next()));
  }
 public static void main(String[] args) throws Exception {
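    // Read the sample input file, segment it, run the baseline annotator, and evaluate its output.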
   SimplePipeline.runPipeline(
       CollectionReaderFactory.createReader(
           ReaderExample.class,
           ReaderExample.PARAM_INPUT_FILE,
           "src/test/resources/test/input.txt"),
       AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
       AnalysisEngineFactory.createEngineDescription(BaselineExample.class),
       AnalysisEngineFactory.createEngineDescription(EvaluatorExample.class));
 }
  public static void main(String[] args) throws Exception {
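    // Build a CAS with a German sample text, run segmentation and Stanford NER, dump the CAS, and print every named entity found.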
    JCas jCas = JCasFactory.createJCas();
    jCas.setDocumentLanguage("de");
    jCas.setDocumentText(
        "Die Fossillagerstätte Geiseltal befindet sich im ehemaligen Braunkohlerevier des Geiseltales südlich der Stadt Halle in Sachsen-Anhalt. Sie ist eine bedeutende Fundstelle heute ausgestorbener Pflanzen und Tiere aus der Zeit des Mittleren Eozäns vor 48 bis 41 Millionen Jahren. Im Geiseltal wurde nachweislich seit 1698 erstmals Kohle gefördert, die ersten Fossilien kamen aber erst Anfang des 20. Jahrhunderts eher zufällig zu Tage. Planmäßige wissenschaftliche Ausgrabungen begannen 1925 seitens der Martin-Luther-Universität Halle-Wittenberg. Unterbrochen durch den Zweiten Weltkrieg, können die Untersuchungen in zwei Forschungsphasen untergliedert werden. Aufgrund der zunehmenden Auskohlung der Rohstofflager kamen die Ausgrabungen Mitte der 1980er allmählich zum Erliegen und endeten endgültig zu Beginn des dritten Jahrtausends.");

    SimplePipeline.runPipeline(
        jCas,
        AnalysisEngineFactory.createEngineDescription(BreakIteratorSegmenter.class),
        AnalysisEngineFactory.createEngineDescription(StanfordNamedEntityRecognizer.class),
        AnalysisEngineFactory.createEngineDescription(CasDumpWriter.class));

    for (NamedEntity ne : JCasUtil.select(jCas, NamedEntity.class)) {
      System.out.println("Found NE: " + ne.getValue() + ", " + ne.getCoveredText());
    }
  }
  @Test
  public void test() throws Exception {
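    // Read XMI files with the given type system and run the aggregate engine parsed from the U2BAggregateDesc descriptor.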
    TypeSystemDescription tsd =
        TypeSystemDescriptionFactory.createTypeSystemDescription("desc.types.test-TypeSystem");

    CollectionReaderDescription colReaderDesc =
        CollectionReaderFactory.createReaderDescription(
            XmiCollectionReader.class, tsd,
            XmiCollectionReader.PARAM_INPUTDIR, inputFileXMIDir);

    // configure AE
    XMLInputSource aeDescInput = new XMLInputSource(U2BAggregateDesc);
    AnalysisEngineDescription aeDesc =
        UIMAFramework.getXMLParser().parseAnalysisEngineDescription(aeDescInput);

    SimplePipeline.runPipeline(colReaderDesc, aeDesc);
  }
  public static void main(String[] args) throws Exception {
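    // Parse the command-line options, build a CLASSIFY-mode aggregate from the trained models, and print the classification assigned to each test document.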
    Options options = CliFactory.parseArguments(Options.class, args);

    List<File> testFiles =
        DocumentClassificationEvaluation.getFilesFromDirectory(options.getTestDirectory());

    DocumentClassificationEvaluation evaluation =
        new DocumentClassificationEvaluation(options.getModelsDirectory());
    CollectionReader collectionReader = evaluation.getCollectionReader(testFiles);

    AggregateBuilder builder =
        DocumentClassificationEvaluation.createDocumentClassificationAggregate(
            options.getModelsDirectory(), AnnotatorMode.CLASSIFY);

    SimplePipeline.runPipeline(
        collectionReader,
        builder.createAggregateDescription(),
        AnalysisEngineFactory.createEngineDescription(PrintClassificationsAnnotator.class));
  }
  public static void main(String[] args) throws IOException, UIMAException {
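    // Read plain text, segment it, remove stopwords, add Mallet embedding annotations, and dump the CASes to the output file.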
    String inputDir = args.length > 0 ? args[0] : DEFAULT_SOURCE;

    CollectionReaderDescription reader =
        createReaderDescription(
            TextReader.class,
            TextReader.PARAM_SOURCE_LOCATION,
            inputDir,
            TextReader.PARAM_LANGUAGE,
            LANGUAGE);
    AnalysisEngineDescription segmenter = createEngineDescription(OpenNlpSegmenter.class);
    AnalysisEngineDescription stopwordRemover =
        createEngineDescription(
            StopWordRemover.class, StopWordRemover.PARAM_MODEL_LOCATION, STOPWORD_FILE);
    AnalysisEngineDescription embeddingsAnnotator =
        createEngineDescription(
            MalletEmbeddingsAnnotator.class,
            MalletEmbeddingsAnnotator.PARAM_MODEL_LOCATION,
            EMBEDDINGS_FILE);
    AnalysisEngineDescription writer =
        createEngineDescription(CasDumpWriter.class, CasDumpWriter.PARAM_OUTPUT_FILE, OUTPUT_FILE);

    SimplePipeline.runPipeline(reader, segmenter, stopwordRemover, embeddingsAnnotator, writer);
  }
  public void testOneWay(String aExpectedFile, String aFile, Object... aExtraParams)
      throws Exception {
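    // Round-trip a Penn Treebank file through the combined reader and writer, then compare the written output against the reference file.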
    File reference = new File("src/test/resources/stanfordPennTrees/" + aExpectedFile);
    File input = new File("src/test/resources/stanfordPennTrees/" + aFile);
    File output = new File("target/test-output/" + name.getMethodName());

    List<Object> extraReaderParams = new ArrayList<>();
    extraReaderParams.add(PennTreebankCombinedReader.PARAM_SOURCE_LOCATION);
    extraReaderParams.add(input);
    extraReaderParams.addAll(asList(aExtraParams));

    CollectionReaderDescription reader =
        createReaderDescription(PennTreebankCombinedReader.class, extraReaderParams.toArray());

    List<Object> extraWriterParams = new ArrayList<>();
    extraWriterParams.add(PennTreebankCombinedWriter.PARAM_TARGET_LOCATION);
    extraWriterParams.add(output);
    extraWriterParams.add(PennTreebankCombinedWriter.PARAM_STRIP_EXTENSION);
    extraWriterParams.add(true);
    extraWriterParams.addAll(asList(aExtraParams));

    AnalysisEngineDescription writer =
        createEngineDescription(PennTreebankCombinedWriter.class, extraWriterParams.toArray());

    if (canParameterBeSet(writer, "overwrite")) {
      setParameter(writer, "overwrite", true);
    }

    runPipeline(reader, writer);

    String expected = FileUtils.readFileToString(reference, "UTF-8");
    String actual =
        FileUtils.readFileToString(
            new File(output, FilenameUtils.getBaseName(input.toString()) + ".penn"), "UTF-8");
    assertEquals(expected.trim(), actual.trim());
  }
  public static void main(String[] args) throws UIMAException, IOException {
    // For our corpus and answer key we will use the Senseval-2 English
    // all-words test data. You need to obtain this data set from the
    // Senseval-2 website. Change the value of the directory variable
    // to point to the location on your filesystem where you stored the
    // data set.
    final String directory =
        "/home/miller/workspace/de.tudarmstadt.ukp.experiments.tm.wsdcorpora/src/main/resources/senseval-2/english-all-words/test/";
    final String corpus = directory + "eng-all-words.test.xml";
    final String answerkey = directory + "eng-all-words.test.key";

    // This is a collection reader for the documents to be disambiguated.
    // The original corpus contains errors so we instruct the collection
    // reader to ignore them.
    CollectionReader reader =
        createReader(
            Senseval2AWReader.class,
            Senseval2AWReader.PARAM_FILE,
            corpus,
            Senseval2AWReader.PARAM_IGNORE_MISSING_SATELLITES,
            true);

    // This Senseval data set uses an unpublished pre-release version of
    // WordNet 1.7 as its sense inventory. This pre-release version is
    // lost. Here we use WordNet 1.7 instead, though some of the sense
    // keys are slightly different. You need to create an extJWNL
    // properties file and change the value of the
    // PARAM_WORDNET_PROPERTIES_URL to point to its location on your file
    // system.
    final String wordnetInventoryName = "WordNet_1.7pre_sensekey";
    ExternalResourceDescription wordnet1_7 =
        createExternalResourceDescription(
            WordNetSenseKeySenseInventoryResource.class,
            WordNetSenseKeySenseInventoryResource.PARAM_WORDNET_PROPERTIES_URL,
            "/home/miller/share/WordNet/WordNet-1.7pre/extjwnl_properties.xml",
            WordNetSenseKeySenseInventoryResource.PARAM_SENSE_INVENTORY_NAME,
            wordnetInventoryName);

    // This AE reads the Senseval-2 answer key. Because the Senseval
    // answer key format doesn't itself indicate what sense inventory is
    // used for the keys, we need to pass this as a configuration parameter.
    // In this case, the keys use sense identifiers which are specific
    // to the Senseval task, so we give it a unique ID.
    final String sensevalInventoryName = "Senseval2_sensekey";
    AnalysisEngineDescription answerReader =
        createEngineDescription(
            SensevalAnswerKeyReader.class,
            SensevalAnswerKeyReader.PARAM_FILE,
            answerkey,
            SensevalAnswerKeyReader.PARAM_SENSE_INVENTORY,
            sensevalInventoryName);

    // The Senseval-2 sense identifiers are based on (but subtly different
    // from) sense keys from the WordNet 1.7-prerelease.  We therefore
    // use this AE  to convert them to WordNet 1.7-prerelease sense keys. We
    // have a delimited text file providing a mapping between the two
    // sense identifiers, which the SenseMapper annotator reads in and
    // uses to perform the conversion.
    AnalysisEngineDescription convertSensevalToSensekey =
        createEngineDescription(
            SenseMapper.class,
            SenseMapper.PARAM_FILE,
            "classpath:/wordnet_senseval.tsv",
            SenseMapper.PARAM_SOURCE_SENSE_INVENTORY_NAME,
            sensevalInventoryName,
            SenseMapper.PARAM_TARGET_SENSE_INVENTORY_NAME,
            wordnetInventoryName,
            SenseMapper.PARAM_KEY_COLUMN,
            2,
            SenseMapper.PARAM_VALUE_COLUMN,
            1,
            SenseMapper.PARAM_IGNORE_UNKNOWN_SENSES,
            true);

    // Here's a resource encapsulating the random sense baseline algorithm.
    ExternalResourceDescription randomBaselineResource =
        createExternalResourceDescription(
            WSDResourceIndividualBasic.class,
            WSDResourceIndividualBasic.SENSE_INVENTORY_RESOURCE,
            wordnet1_7,
            WSDResourceIndividualBasic.DISAMBIGUATION_METHOD,
            RandomSenseBaseline.class.getName());

    AnalysisEngineDescription randomBaseline =
        createEngineDescription(
            WSDAnnotatorIndividualBasic.class,
            WSDAnnotatorIndividualBasic.WSD_ALGORITHM_RESOURCE,
            randomBaselineResource);

    // This AE prints out detailed information on the AEs' sense
    // assignments. It's excluded from the pipeline by default as it
    // produces quite a lot of output.
    @SuppressWarnings("unused")
    AnalysisEngineDescription writer = createEngineDescription(WSDWriter.class);

    // This AE compares the sense assignments of all algorithms against
    // the given gold standard (in this case, the answer key we read in)
    // and computes and prints out useful statistics, such as precision,
    // recall, and coverage.
    AnalysisEngineDescription evaluator =
        createEngineDescription(
            MultipleExactMatchEvaluator.class,
            MultipleExactMatchEvaluator.PARAM_GOLD_STANDARD_ALGORITHM,
            answerkey);

    // Here we run the pipeline
    SimplePipeline.runPipeline(
        reader,
        answerReader,
        convertSensevalToSensekey,
        randomBaseline,
        // writer,
        evaluator);
  }
  // TODO the method should be private
  public void processEnglishFile(Document doc, String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {
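    // Output files produced per data file: plain-text dump, good-vs-bad CSV, pairwise features, and KELP training data.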

    String plainTextOutputPath = dataFile + "plain.txt";
    String goodVSbadOutputPath = dataFile + ".csv";
    String pairwiseOutputPath = dataFile + getPairwiseSuffix();
    String kelpFilePath = dataFile + ".klp";

    /** Marker which adds relational information to a pair of trees */
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

    /** Load stopwords for english */
    marker.useStopwords(Stopwords.STOPWORD_EN);

    /** Tree serializer for converting tree structures to string */
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    // CASes are instantiated inside the for loop below
    // JCas questionCas = JCasFactory.createJCas();

    // WriteFile out = new WriteFile(dataFile + ".csv");
    // TODO ABC, Sep 10th 2015. Do we really need this? It seems like a bad patch
    doc.select("QURAN").remove();
    doc.select("HADEETH").remove();

    boolean firstRow = true;

    /** Consume data */
    Elements questions = doc.getElementsByTag("Question");
    int numberOfQuestions = questions.size();
    int qNumber = 1;

    for (Element question : questions) {
      System.out.println("[INFO]: Processing " + qNumber++ + " out of " + numberOfQuestions);

      CQAinstance cqainstance = qElementToObject(question);

      getFeaturesFromThread(cqainstance);
      // TODO MOVE FROM HERE TO getFeaturesFromThread.
      // FOR THAT the printing operations have to be moved out and
      // question and comment must have a method to extract header+body.
      // Move them from SubjectBodyAggregator
      // CONTINUE FROM HERE
      /** Setup question CAS */

      // questionCas.reset();
      JCas questionCas = cqaElementToCas(cqainstance.getQuestion());

      fm.writeLn(
          plainTextOutputPath,
          "---------------------------- QID: "
              + cqainstance.getQuestion().getId()
              + " USER: "
              // user id (accessor name assumed; this expression was redacted in the source)
              + cqainstance.getQuestion().getUserid());
      // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

      /*Comment-level features to be combined*/
      List<List<Double>> listFeatures = new ArrayList<List<Double>>();
      List<Map<String, Double>> albertoSimoneFeatures = null;
      if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) { // TODO RENAME THIS PLEASE
        albertoSimoneFeatures = FeatureExtractor.extractFeatures(cqainstance);
      }

      int commentIndex = 0;
      List<JCas> allCommentsCas = new ArrayList<JCas>();
      for (CQAcomment c : cqainstance.getComments()) {
        /** Setup comment CAS */
        JCas commentCas = cqaElementToCas(c);

        /** Run the UIMA pipeline */
        SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
        // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " +
        // cbody));

        AugmentableFeatureVector fv;
        if (GENERATE_MASSIMO_FEATURES) {
          fv =
              (AugmentableFeatureVector)
                  pfEnglish.getPairFeatures(questionCas, commentCas, PARAMETER_LIST);
        } else {
          fv = new AugmentableFeatureVector(this.alphabet);
        }

        if (GENERATE_ALBERTO_AND_SIMONE_FEATURES) {
          Map<String, Double> featureVector = albertoSimoneFeatures.get(commentIndex);
          for (String featureName : FeatureExtractor.getAllFeatureNames()) {
            Double value = featureVector.get(featureName);
            double featureValue = 0;
            if (value != null) {
              featureValue = value;
            }
            fv.add(featureName, featureValue);
          }
        }
        commentIndex++;

        /**
         * ************************************* * * * PLUG YOUR FEATURES HERE * * * *
         * *************************************
         */

        /**
         * fv is actually an AugmentableFeatureVector from the Mallet library
         *
         * <p>Internally the features are named so you must specify an unique identifier.
         *
         * <p>An example:
         *
         * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
         *
         * <p>or:
         *
         * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
         * afv.add("your_super_feature_id", 42);
         */

        // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

        /**
         * ************************************* * * * THANKS! * * * *
         * *************************************
         */

        /** Produce outputs */
        writeToPlainTextOutput(plainTextOutputPath, c, commentCas);

        //				String goodVSbadOutputPath = dataFile + ".csv";
        //				String pairwiseOutputPath

        // FIXME Once we fix that issue with the features, we can know this info
        // in advance and fix the output, probably out of the method
        if (firstRow) {
          // header for Good vs Bad
          this.fm.write(goodVSbadOutputPath, "qid,cgold,cgold_yn");
          for (int i = 0; i < fv.numLocations(); i++) {
            int featureIndex = i + 1;
            this.fm.write(goodVSbadOutputPath, ",f" + featureIndex);
          }
          this.fm.writeLn(goodVSbadOutputPath, "");

          // header for pairwise
          this.fm.write(pairwiseOutputPath, "qid,cgold");
          int numFeatures = fv.numLocations();
          if (COMBINATION_CONCAT) {
            numFeatures *= 2;
          }
          if (INCLUDE_SIMILARITIES) {
            numFeatures += PairFeatureFactoryEnglish.NUM_SIM_FEATURES;
          }

          for (int i = 0; i < numFeatures; i++) {
            int featureIndex = i + 1;
            this.fm.write(pairwiseOutputPath, ",f" + featureIndex);
          }
          this.fm.writeLn(pairwiseOutputPath, "");

          firstRow = false;
        }

        List<Double> features = this.serializeFv(fv);
        listFeatures.add(features);

        this.fm.writeLn(
            goodVSbadOutputPath,
            c.getId()
                + ","
                + c.getGold()
                + ","
                + c.getGold_yn()
                + ","
                + Joiner.on(",").join(features));

        /** Produce also the file needed to train structural models */
        if (PRODUCE_SVMLIGHTTK_DATA) {
          produceSVMLightTKExample(
              questionCas,
              commentCas,
              suffix,
              ts,
              cqainstance.getQuestion().getId(),
              c.getId(),
              c.getGold(),
              c.getGold_yn(),
              features);
        }
        if (PRODUCE_KELP_DATA) {
          produceKelpExample(
              questionCas,
              commentCas,
              kelpFilePath,
              ts,
              cqainstance.getQuestion().getId(),
              c.getId(),
              c.getGold(),
              c.getGold_yn(),
              features);
        }
        allCommentsCas.add(commentCas);
      }
      // TODO MOVE UP TO HERE

      this.fm.write(
          pairwiseOutputPath, computePairwiseFeatures(cqainstance, listFeatures, allCommentsCas));
      // out.writeLn(computePairwiseFeatures(q, listFeatures);
    }

    //		Iterator<String> iterator = questionCategories.iterator();
    //		while(iterator.hasNext()){
    //			System.out.println("CATEGORY_" + iterator.next());
    //		}

    this.fm.closeFiles();
  }
  /**
   * Process the XML file and output a CSV file with the results in the same directory.
   *
   * @param dataFile the XML file to process
   * @param suffix suffix identifying the data file
   * @throws ResourceInitializationException
   * @throws UIMAException
   * @throws IOException
   * @throws AnalysisEngineProcessException
   * @throws SimilarityException
   */
  private void processEnglishFile(String dataFile, String suffix)
      throws ResourceInitializationException, UIMAException, IOException,
          AnalysisEngineProcessException, SimilarityException {

    /** Parameters for matching tree structures */
    String parameterList =
        Joiner.on(",")
            .join(new String[] {RichNode.OUTPUT_PAR_LEMMA, RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    /** Marker which adds relational information to a pair of trees */
    MarkTreesOnRepresentation marker = new MarkTreesOnRepresentation(new MarkTwoAncestors());

    /** Load stopwords for english */
    marker.useStopwords(Stopwords.STOPWORD_EN);

    /** Tree serializer for converting tree structures to string */
    TreeSerializer ts = new TreeSerializer().enableRelationalTags().useRoundBrackets();

    /** Instantiate CASes */
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");

    Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

    doc.select("QURAN").remove();
    doc.select("HADEETH").remove();

    boolean firstRow = true;

    /** Consume data */
    Elements questions = doc.getElementsByTag("Question");
    int numberOfQuestions = questions.size();
    int questionNumber = 1;

    Map<String, Boolean> commentIsDialogue = new HashMap<>();

    for (Element question : questions) {
      System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);
      /** Parse question node */
      String qid = question.attr("QID");
      String qcategory = question.attr("QCATEGORY");
      String qdate = question.attr("QDATE");
      String quserid = question.attr("QUSERID");
      String qtype = question.attr("QTYPE");
      String qgold_yn = question.attr("QGOLD_YN");
      String qsubject = question.getElementsByTag("QSubject").get(0).text();
      String qbody = question.getElementsByTag("QBody").get(0).text();

      /** Setup question CAS */
      questionCas.reset();
      questionCas.setDocumentLanguage("en");
      questionCas.setDocumentText(qsubject + ". " + qbody);

      /** Run the UIMA pipeline */
      SimplePipeline.runPipeline(questionCas, this.analysisEngineList);

      // this.analyzer.analyze(questionCas, new SimpleContent("q-" + qid, qsubject + ". " + qbody));

      /** Parse comment nodes */
      Elements comments = question.getElementsByTag("Comment");
      for (Element comment : comments) {
        String cid = comment.attr("CID");
        String cuserid = comment.attr("CUSERID");
        String cgold = comment.attr("CGOLD");
        String cgold_yn = comment.attr("CGOLD_YN");
        String csubject = comment.getElementsByTag("CSubject").get(0).text();
        String cbody = comment.getElementsByTag("CBody").get(0).text();

        /** Setup comment CAS */
        commentCas.reset();
        commentCas.setDocumentLanguage("en");
        commentCas.setDocumentText(csubject + ". " + cbody);

        /** Run the UIMA pipeline */
        SimplePipeline.runPipeline(commentCas, this.analysisEngineList);

        // this.analyzer.analyze(commentCas, new SimpleContent("c-" + cid, csubject + ". " +
        // cbody));

        FeatureVector fv = pfEnglish.getPairFeatures(questionCas, commentCas, parameterList);

        /**
         * ************************************* * * * PLUG YOUR FEATURES HERE * * * *
         * *************************************
         */

        /**
         * fv is actually an AugmentableFeatureVector from the Mallet library
         *
         * <p>Internally the features are named so you must specify an unique identifier.
         *
         * <p>An example:
         *
         * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
         *
         * <p>or:
         *
         * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
         * afv.add("your_super_feature_id", 42);
         */
        boolean quseridEqCuserid = quserid.equals(cuserid);
        if (quseridEqCuserid) {
          commentIsDialogue.put(cid, true);
        }

        // ((AugmentableFeatureVector) fv).add("quseridEqCuserid", quseridEqCuserid);

        /**
         * ************************************* * * * THANKS! * * * *
         * *************************************
         */

        /** Produce output line */
        if (firstRow) {
          out.write("qid,cgold,cgold_yn");
          for (int i = 0; i < fv.numLocations(); i++) {
            int featureIndex = i + 1;
            out.write(",f" + featureIndex);
          }
          out.write("\n");

          firstRow = false;
        }

        List<Double> features = this.serializeFv(fv);

        out.writeLn(cid + "," + cgold + "," + cgold_yn + "," + Joiner.on(",").join(features));

        /** Produce also the file needed to train structural models */
        if (PRODUCE_SVMLIGHTTK_DATA) {
          produceSVMLightTKExample(
              questionCas, commentCas, suffix, ts, qid, cid, cgold, cgold_yn, features);
        }
      }
    }

    for (String commentId : commentIsDialogue.keySet()) {
      this.fm.writeLn(dataFile + ".dialogue.txt", commentId);
    }

    this.fm.closeFiles();
    out.close();
  }
  public void processArabicFile(Analyzer analyzer, String dataFile, String suffix)
      throws SimilarityException, UIMAException, IOException {
    /** We do not have a lemmatizer so we work with tokens */
    String parameterList = Joiner.on(",").join(new String[] {RichNode.OUTPUT_PAR_TOKEN_LOWERCASE});

    /** Instantiate CASes */
    JCas questionCas = JCasFactory.createJCas();
    JCas commentCas = JCasFactory.createJCas();

    WriteFile out = new WriteFile(dataFile + ".csv");

    Document doc = Jsoup.parse(new File(dataFile), "UTF-8");

    boolean firstRow = true;

    /** Consume data */
    Elements questions = doc.getElementsByTag("Question");

    int numberOfQuestions = questions.size();
    int questionNumber = 1;

    for (Element question : questions) {
      System.out.println("[INFO]: Processing " + questionNumber++ + " out of " + numberOfQuestions);
      /** Parse question node */
      String qid = question.attr("QID");
      String qcategory = question.attr("QCATEGORY");
      String qdate = question.attr("QDATE");
      String qsubject =
          question
              .getElementsByTag("QSubject")
              .get(0)
              .text()
              .replaceAll("/", "")
              .replaceAll("~", "");
      String qbody =
          question.getElementsByTag("QBody").get(0).text().replaceAll("/", "").replaceAll("~", "");

      /** Get analyzed text for question */
      if (USE_QCRI_ALT_TOOLS) {
        questionCas = this.getPreliminarCas(analyzer, questionCas, qid, qsubject + ". " + qbody);
      } else {
        questionCas.reset();
        questionCas.setDocumentLanguage("ar");
        questionCas.setDocumentText(qsubject + ". " + qbody);
        SimplePipeline.runPipeline(questionCas, this.analysisEngineList);
      }

      /** Parse answer nodes */
      Elements comments = question.getElementsByTag("Answer");
      for (Element comment : comments) {
        String cid = comment.attr("CID");
        String cgold = comment.attr("CGOLD");
        String cbody = comment.text().replaceAll("/", "").replaceAll("~", "");

        /** Get analyzed text for comment */
        if (USE_QCRI_ALT_TOOLS) {
          commentCas = this.getPreliminarCas(analyzer, commentCas, cid, cbody);
        } else {
          commentCas.reset();
          commentCas.setDocumentLanguage("ar");
          commentCas.setDocumentText(cbody);

          SimplePipeline.runPipeline(commentCas, this.analysisEngineList);
        }

        /** Compute features between question and comment */
        FeatureVector fv = pfArabic.getPairFeatures(questionCas, commentCas, parameterList);

        /**
         * ************************************* * * * PLUG YOUR FEATURES HERE * * * *
         * *************************************
         */

        /**
         * fv is actually an AugmentableFeatureVector from the Mallet library
         *
         * <p>Internally the features are named so you must specify an unique identifier.
         *
         * <p>An example:
         *
         * <p>((AugmentableFeatureVector) fv).add("your_super_feature_id", 42);
         *
         * <p>or:
         *
         * <p>AugmentableFeatureVector afv = (AugmentableFeatureVector) fv;
         * afv.add("your_super_feature_id", 42);
         */

        /**
         * ************************************* * * * THANKS! * * * *
         * *************************************
         */

        /** Produce output line */
        if (firstRow) {
          out.write("cid,cgold");
          for (int i = 0; i < fv.numLocations(); i++) {
            int featureIndex = i + 1;
            out.write(",f" + featureIndex);
          }
          out.write("\n");

          firstRow = false;
        }

        List<Double> features = this.serializeFv(fv);

        /** Produce output line */
        out.writeLn(qid + "-" + cid + "," + cgold + "," + Joiner.on(",").join(features));
      }
    }

    this.fm.closeFiles();
    out.close();
  }