public JCas getPreliminarCas( Analyzer analyzer, JCas emptyCas, String sentenceId, String sentence) { this.preliminaryCas.reset(); /** Without this the annotator fails badly */ sentence = sentence.replaceAll("/", ""); sentence = sentence.replaceAll("~", ""); // Carry out preliminary analysis Analyzable content = new SimpleContent(sentenceId, sentence, ArabicAnalyzer.ARABIC_LAN); analyzer.analyze(this.preliminaryCas, content); // Copy data to a new CAS and use normalized text as DocumentText emptyCas.reset(); emptyCas.setDocumentLanguage(ArabicAnalyzer.ARABIC_LAN); CasCopier.copyCas(this.preliminaryCas.getCas(), emptyCas.getCas(), false); String normalizedText = JCasUtil.selectSingle(this.preliminaryCas, NormalizedText.class).getText(); emptyCas.setDocumentText(normalizedText); return emptyCas; }
private void setupUimaTools() throws IOException, UIMAException { /* Document docTrain = JsoupUtils.getDoc(CQA_QL_TRAIN_EN); Document docDevel = JsoupUtils.getDoc(CQA_QL_DEV_EN); Document docTest = JsoupUtils.getDoc(CQA_QL_TEST_EN); this.userProfiles = UserProfile.createUserProfiles(docTrain, docDevel, docTest); //////user profiles are built on the training+dev+test semeval2015 datasets! */ // for(Entry<String, UserProfile> entry: userProfiles.entrySet()){ // if(entry.getValue().getSignatures().size()>0){ // System.out.println("---------- SIGNATURES FOR USER: "******" // ----------"); // for(String signature : entry.getValue().getSignatures()){ // System.out.println("____\n" + JsoupUtils.recoverOriginalText(signature)); // } // } // } this.stopwords = new Stopwords(Stopwords.STOPWORD_EN); this.pfEnglish = new PairFeatureFactoryEnglish(this.alphabet); this.pfEnglish.setupMeasures(RichNode.OUTPUT_PAR_LEMMA, this.stopwords); /** Add some punctuation to the stopwords list */ for (String stopword : ".|...|\\|,|?|!|#|(|)|$|%|&".split("\\|")) { this.stopwords.add(stopword); } /** Specify A and B subtask labels */ this.a_labels.add("Not English"); this.a_labels.add("Good"); this.a_labels.add("Potential"); this.a_labels.add("Dialogue"); this.a_labels.add("Bad"); this.b_labels.add("No"); this.b_labels.add("Yes"); this.b_labels.add("Unsure"); /** Create the analysis pipeline */ AnalysisEngine segmenter = createEngine(createEngineDescription(OpenNlpSegmenter.class)); AnalysisEngine postagger = createEngine(createEngineDescription(OpenNlpPosTagger.class)); AnalysisEngine chunker = createEngine(createEngineDescription(OpenNlpChunker.class)); AnalysisEngine lemmatizer = createEngine(createEngineDescription(StanfordLemmatizer.class)); this.analysisEngineList = new AnalysisEngine[4]; this.analysisEngineList[0] = segmenter; this.analysisEngineList[1] = postagger; this.analysisEngineList[2] = chunker; this.analysisEngineList[3] = lemmatizer; this.analyzer = new Analyzer(new 
UIMAFilePersistence("CASes/semeval")); for (AnalysisEngine ae : this.analysisEngineList) { analyzer.addAE(ae); } }
/**
 * Configures the Arabic resources (stopwords, pair-feature factory, labels, analysis tools)
 * and then processes the Arabic train and dev files.
 *
 * <p>A {@code SimilarityException} or {@code IOException} raised while processing is printed
 * and not propagated; a failure on the train file also skips the dev file.
 *
 * @throws UIMAException if the preliminary CAS or an analysis engine cannot be created
 */
public void runForArabic() throws UIMAException {
  // NOTE(review): the STOPWORD_AR list is replaced right away by the corpus-specific one, so
  // only the latter is in effect — confirm that the two were not meant to be combined.
  this.stopwords = new Stopwords(Stopwords.STOPWORD_AR);
  this.stopwords = new Stopwords("semeval2015-3/arabic-corpus-specific-stopwords.txt");

  this.pfArabic = new PairFeatureFactoryArabic(this.alphabet);
  this.pfArabic.setupMeasures(RichNode.OUTPUT_PAR_TOKEN_LOWERCASE, this.stopwords);

  this.language = LANG_ARABIC;
  this.preliminaryCas = JCasFactory.createJCas();

  // Task labels: for Arabic there is just one task.
  String[] arabicLabels = {"direct", "related", "irrelevant"};
  for (String label : arabicLabels) {
    this.a_labels.add(label);
  }

  if (!USE_QCRI_ALT_TOOLS) {
    // UIMA DKPro annotators. A whitespace tokenizer is used because, according to the
    // original author's note, the Stanford Segmenter for Arabic has a very bad bug and its
    // tokenization is completely wrong. It is followed by the Stanford POS tagger.
    // NOTE(review): this.analyzer is not assigned on this path; processArabicFile below gets
    // whatever the field currently holds — confirm that this is intended.
    this.analysisEngineList =
        new AnalysisEngine[] {
          createEngine(createEngineDescription(WhitespaceTokenizer.class)),
          createEngine(
              createEngineDescription(
                  StanfordPosTagger.class,
                  StanfordPosTagger.PARAM_LANGUAGE,
                  "ar",
                  StanfordPosTagger.PARAM_VARIANT,
                  "accurate"))
        };
  } else {
    // QCRI analyzer, without persistence.
    this.analyzer = new Analyzer(new UIMANoPersistence());
    this.analyzer.addAE(
        AnalysisEngineFactory.createEngine(createEngineDescription(ArabicAnalyzer.class)));
  }

  try {
    processArabicFile(this.analyzer, CQA_QL_TRAIN_AR, "train");
    processArabicFile(this.analyzer, CQA_QL_DEV_AR, "dev");
  } catch (SimilarityException | IOException e) {
    e.printStackTrace();
  }
}
/**
 * Configures the English resources (stopwords, pair-feature factory, subtask labels, analysis
 * pipeline) and then processes the English train and dev files.
 *
 * <p>A {@code UIMAException}, {@code IOException} or {@code SimilarityException} raised while
 * processing is printed and not propagated; a failure on the train file also skips the dev
 * file.
 *
 * @throws UIMAException if one of the analysis engines cannot be created
 */
public void runForEnglish() throws UIMAException {
  this.stopwords = new Stopwords(Stopwords.STOPWORD_EN);

  this.pfEnglish = new PairFeatureFactoryEnglish(this.alphabet);
  this.pfEnglish.setupMeasures(RichNode.OUTPUT_PAR_LEMMA, this.stopwords);

  this.language = LANG_ENGLISH;

  // Extend the stopword list with some punctuation. This happens after setupMeasures was
  // handed the list.
  String[] punctuation = ".|...|\\|,|?|!|#|(|)|$|%|&".split("\\|");
  for (int i = 0; i < punctuation.length; i++) {
    this.stopwords.add(punctuation[i]);
  }

  // Labels of subtask A, followed by the labels of subtask B.
  String[] subtaskA = {"Not English", "Good", "Potential", "Dialogue", "Bad"};
  for (String label : subtaskA) {
    this.a_labels.add(label);
  }
  String[] subtaskB = {"No", "Yes", "Unsure"};
  for (String label : subtaskB) {
    this.b_labels.add(label);
  }

  // Analysis pipeline: OpenNLP segmenter, POS tagger and chunker, then the Stanford
  // lemmatizer, created in the order in which they are run.
  this.analysisEngineList =
      new AnalysisEngine[] {
        createEngine(createEngineDescription(OpenNlpSegmenter.class)),
        createEngine(createEngineDescription(OpenNlpPosTagger.class)),
        createEngine(createEngineDescription(OpenNlpChunker.class)),
        createEngine(createEngineDescription(StanfordLemmatizer.class))
      };

  // The analyzer is backed by a UIMAFilePersistence on "CASes/semeval".
  this.analyzer = new Analyzer(new UIMAFilePersistence("CASes/semeval"));
  for (int i = 0; i < this.analysisEngineList.length; i++) {
    this.analyzer.addAE(this.analysisEngineList[i]);
  }

  try {
    this.processEnglishFile(CQA_QL_TRAIN_EN, "train");
    this.processEnglishFile(CQA_QL_DEV_EN, "dev");
  } catch (UIMAException | IOException | SimilarityException e) {
    e.printStackTrace();
  }
}