/** * Lemmatise a phrase or word. If a phrase, only lemmatise the most RHS word. * * @param value * @return */ public String normalise(String value) { if (value.indexOf(" ") == -1 || value.endsWith(" s") || value.endsWith( "'s")) // if string is a single word, or it is in "XYZ's" form where the ' char has been // removed return lemmatizer.lemmatize(value, 1).trim(); String part1 = value.substring(0, value.lastIndexOf(" ")); String part2 = lemmatizer.lemmatize(value.substring(value.lastIndexOf(" ") + 1), 1); return part1 + " " + part2.trim(); }
/**
 * Prints the settings of this configuration to standard output. Useful for checking that the
 * configuration is set as desired prior to a training run.
 *
 * <p>Component fields (lemmatiser, POS tagger, tokenizer, tag format, post-processor) that are
 * {@code null} are reported as the text {@code null} rather than causing a
 * {@link NullPointerException}, so the method can be called on a partially configured
 * instance.
 */
public void log() {
  // Resolve every component name up front with the same null guard, so that one missing
  // component cannot abort the whole report.
  String lemmatiserName = lemmatiser == null ? null : lemmatiser.getClass().getName();
  String posTaggerName = posTagger == null ? null : posTagger.getClass().getName();
  String tokenizerName = tokenizer == null ? null : tokenizer.getClass().getName();
  String tagFormatName = tagFormat == null ? null : tagFormat.name();
  String postProcessorName = postProcessor == null ? null : postProcessor.getClass().getName();

  System.out.println("Lemmatiser: " + lemmatiserName);
  System.out.println("POSTagger: " + posTaggerName);
  System.out.println("Tokenizer: " + tokenizerName);
  System.out.println("Tag format: " + tagFormatName);
  System.out.println("PostProcessor: " + postProcessorName);
  System.out.println("Using numeric normalization: " + useNumericNormalization);
  System.out.println("CRF order is " + order);
  System.out.println("Using feature induction: " + useFeatureInduction);
  System.out.println("Text textDirection: " + textDirection);
}
/**
 * Returns the lemma of a word, using its part-of-speech tag when one is known.
 *
 * <p>The tag is translated to a numeric code through {@code tagLookUp}. When the tag has no
 * mapping, or maps to {@code 0}, the word is lemmatised without part-of-speech information.
 *
 * @param value original word
 * @param pos the part of speech of the last word
 * @return the lemma of original word
 */
public String getLemma(String value, String pos) {
  // Keep the lookup result boxed: unboxing straight into an int would throw a
  // NullPointerException for a tag with no entry (assuming tagLookUp behaves like a Map and
  // returns null for unknown keys).
  Integer posCode = tagLookUp.get(pos);

  if (posCode == null || posCode == 0) {
    return lemmatizer.lemmatize(value);
  }
  return lemmatizer.lemmatize(value, posCode);
}