/**
 * Checks that the rule flags {@code s} exactly once and that the message of that match contains
 * {@code expectedErrorSubstring}.
 *
 * @param s sentence to run the rule on
 * @param expectedErrorSubstring text expected inside the match's message
 * @throws IOException declared because analyzing or matching the sentence may throw it
 */
private void assertBadWithMessage(String s, String expectedErrorSubstring) throws IOException {
  final int matchCount = rule.match(langTool.getAnalyzedSentence(s)).length;
  assertEquals(1, matchCount);
  // NOTE(review): the sentence is analyzed and matched a second time just to read the message.
  final String errorMessage = rule.match(langTool.getAnalyzedSentence(s))[0].getMessage();
  final String failureText =
      "Got error '" + errorMessage + "', expected substring '" + expectedErrorSubstring + "'";
  final boolean containsExpected = errorMessage.contains(expectedErrorSubstring);
  assertTrue(failureText, containsExpected);
}
/**
 * Exercises {@code rule.compareLists} on the whitespace-free tokens of two German sentences,
 * using index ranges that are expected to match and ranges that are expected not to.
 */
public void testCompareLists() throws IOException {
  AnalyzedSentence shortSentence = langTool.getAnalyzedSentence("Hier ein Test");

  String[] startPlusTwoWords = {"", "Hier", "ein"};
  assertTrue(
      rule.compareLists(shortSentence.getTokensWithoutWhitespace(), 0, 2, startPlusTwoWords));

  String[] twoWords = {"Hier", "ein"};
  assertTrue(rule.compareLists(shortSentence.getTokensWithoutWhitespace(), 1, 2, twoWords));

  String[] startPlusAllWords = {"", "Hier", "ein", "Test"};
  assertTrue(
      rule.compareLists(shortSentence.getTokensWithoutWhitespace(), 0, 3, startPlusAllWords));
  // Same pattern, but the end index is one larger.
  assertFalse(
      rule.compareLists(shortSentence.getTokensWithoutWhitespace(), 0, 4, startPlusAllWords));

  AnalyzedSentence longSentence = langTool.getAnalyzedSentence("das Heilige Römische Reich");
  String[] empirePattern = {"", "das", "Heilige", "Römische", "Reich"};
  assertTrue(rule.compareLists(longSentence.getTokensWithoutWhitespace(), 0, 4, empirePattern));
  assertFalse(rule.compareLists(longSentence.getTokensWithoutWhitespace(), 8, 11, empirePattern));
}
/**
 * Searches Twitter for every command-line argument, prints the tweets accepted by
 * {@code isEligible}, then checks the collected tweet texts with an American English
 * {@code JLanguageTool} and prints every reported match.
 *
 * @param args search terms, one query per argument
 */
public static void main(String[] args) throws TwitterException, IOException {
  Twitter twitter = TwitterFactory.getSingleton();
  JLanguageTool langTool = new JLanguageTool(new AmericanEnglish());
  List<String> collectedTexts = new ArrayList<String>();

  for (String searchTerm : args) {
    Query query = new Query(searchTerm);
    int eligibleCount = 0;
    QueryResult result;
    // Keep paging while there is a next query and fewer than 5 eligible tweets were counted.
    do {
      result = twitter.search(query);
      for (Status tweet : result.getTweets()) {
        if (!isEligible(tweet)) {
          continue;
        }
        System.out.println("@" + tweet.getUser().getScreenName() + " - " + tweet.getText());
        System.out.println(tweet.getLang());
        collectedTexts.add(tweet.getText());
        eligibleCount++;
      }
      query = result.nextQuery();
    } while (query != null && eligibleCount < 5);
  }

  for (String text : collectedTexts) {
    for (RuleMatch match : langTool.check(text)) {
      String location =
          "Potential error at line "
              + match.getLine()
              + ", column "
              + match.getColumn()
              + ": "
              + match.getMessage();
      System.out.println(location);
      System.out.println("Suggested correction: " + match.getSuggestedReplacements());
    }
  }
}
public void testRegression() throws IOException { JLanguageTool gramCheckerEngine = new JLanguageTool(new German()); gramCheckerEngine.activateDefaultPatternRules(); // used to be not detected > 1.0.1: String str = "Und so.\r\nDie Bier."; List<RuleMatch> matches = gramCheckerEngine.check(str); assertEquals(1, matches.size()); }
/**
 * Splits {@code contents} into sentences with {@code lt}, analyzes each sentence and prints the
 * result of {@code getSentence} for it to standard output.
 *
 * @param contents text to tag
 * @param lt tool used for sentence splitting and analysis
 */
private static void tagText(final String contents, final JLanguageTool lt) throws IOException {
  for (final String sentence : lt.sentenceTokenize(contents)) {
    System.out.println(getSentence(lt.getAnalyzedSentence(sentence)));
  }
}
/**
 * Verifies the start and end position of the first match that {@code AccentuationCheckRule}
 * reports for a Catalan sentence.
 */
public void testPositions() throws IOException {
  final AccentuationCheckRule rule = new AccentuationCheckRule(TestTools.getEnglishMessages());
  final JLanguageTool catalanTool = new JLanguageTool(new Catalan());
  final RuleMatch[] matches =
      rule.match(catalanTool.getAnalyzedSentence("Són circumstancies extraordinàries."));
  final RuleMatch firstMatch = matches[0];
  assertEquals(4, firstMatch.getFromPos());
  assertEquals(18, firstMatch.getToPos());
}
/**
 * Runs {@code chunker} on an analyzed Ukrainian sentence and checks that the readings of token 1
 * contain {@code <adv>} and the readings of token 4 contain {@code </adv>}.
 */
public void testChunker() throws Exception {
  JLanguageTool ukrainianTool = new JLanguageTool(new Ukrainian());
  AnalyzedTokenReadings[] tokens =
      chunker.disambiguate(ukrainianTool.getAnalyzedSentence("Для годиться.")).getTokens();

  String openingReadings = tokens[1].getReadings().toString();
  assertTrue(openingReadings.contains("<adv>"));

  String closingReadings = tokens[4].getReadings().toString();
  assertTrue(closingReadings.contains("</adv>"));
}
/**
 * Builds a {@code CombiningTagger} from two {@code ManualTagger}s that read
 * {@code /xx/added1.txt} and {@code /xx/added2.txt} from the resource directory.
 *
 * @param overwrite passed through unchanged to the {@code CombiningTagger} constructor
 */
private CombiningTagger getCombiningTagger(boolean overwrite) throws IOException {
  // NOTE(review): the resource streams are not closed here; whether ManualTagger closes them
  // is not visible from this method - confirm.
  return new CombiningTagger(
      new ManualTagger(
          JLanguageTool.getDataBroker().getFromResourceDirAsStream("/xx/added1.txt")),
      new ManualTagger(
          JLanguageTool.getDataBroker().getFromResourceDirAsStream("/xx/added2.txt")),
      overwrite);
}
/**
 * Creates a {@code JLanguageTool} for {@code lang}, disables every rule returned by
 * {@code getAllActiveRules()} and then adds {@code patternRule}.
 *
 * @param lang language the tool is created for
 * @param patternRule rule added after the active rules have been disabled
 * @return the configured tool
 */
private JLanguageTool getLanguageToolWithOneRule(Language lang, PatternRule patternRule)
    throws IOException {
  final JLanguageTool tool = new JLanguageTool(lang);
  for (Rule activeRule : tool.getAllActiveRules()) {
    final String ruleId = activeRule.getId();
    tool.disableRule(ruleId);
  }
  tool.addRule(patternRule);
  return tool;
}
/**
 * Re-creates {@code langTool} for the given language, disables every rule returned by
 * {@code getAllRules()} and adds a fresh {@code GenericUnpairedBracketsRule}.
 *
 * @param language language used for both the tool and the brackets rule
 */
private void setUpRule(Language language) throws IOException {
  langTool = new JLanguageTool(language);
  for (Rule existingRule : langTool.getAllRules()) {
    langTool.disableRule(existingRule.getId());
  }
  // The brackets rule is only registered with langTool; it is not stored in a field here.
  langTool.addRule(new GenericUnpairedBracketsRule(TestTools.getEnglishMessages(), language));
}
/**
 * Runs the rule over Catalan sentences that must not be flagged and sentences that must be
 * flagged, then checks match counts and the first three suggestions for selected sentences.
 */
public void testRule() throws IOException {
  // correct sentences:
  // (disabled) "els etiquetadors sobre els etiquetats."
  final String[] correctSentences = {
    "tot tenyit amb llum de nostàlgia",
    "Ho van fer per duplicat.",
    "Assecat el braç del riu",
    "el llibre empaquetat",
    "un resultat equilibrat",
    "el nostre equip era bastant equilibrat",
    "un llibre ben empaquetat",
    "l'informe filtrat pel ministre",
    "L'informe filtrat és terrible",
    "ha liderat la batalla",
    "Els tinc empaquetats",
    "amb tractament unitari i equilibrat",
    "Processat després de la mort de Carles II",
    "Processat diverses vegades",
    "moltes vegades empaquetat amb pressa",
    "és llavors embotellat i llançat al mercat",
    "la comercialització de vi embotellat amb les firmes comercials",
    "eixia al mercat el vi blanc embotellat amb la marca",
    "que arribi a un equilibrat matrimoni",
    "És un cafè amb molt de cos i molt equilibrat.",
    "i per tant etiquetat com a observat",
    "Molt equilibrat en les seves característiques",
    "filtrat per Wikileaks",
    "una vegada filtrat",
    "no equilibrat"
  };
  for (String sentence : correctSentences) {
    assertCorrect(sentence);
  }

  // errors:
  // (disabled) "Duplicat de claus", originally listed after "Els equilibrats de les rodes"
  final String[] incorrectSentences = {
    "Assecat del braç del riu",
    "Cal vigilar el filtrat del vi",
    "El procés d'empaquetat",
    "Els equilibrats de les rodes",
    "El procés d'etiquetat de les ampolles",
    "El rentat de cotes"
  };
  for (String sentence : incorrectSentences) {
    assertIncorrect(sentence);
  }

  RuleMatch[] found = rule.match(langTool.getAnalyzedSentence("El repicat i el rejuntat."));
  assertEquals(2, found.length);

  found = rule.match(langTool.getAnalyzedSentence("El procés de relligat dels llibres."));
  assertEquals(1, found.length);
  final String[] relligatSuggestions = {"relligadura", "relligament", "relligada"};
  for (int i = 0; i < relligatSuggestions.length; i++) {
    assertEquals(relligatSuggestions[i], found[0].getSuggestedReplacements().get(i));
  }

  found = rule.match(langTool.getAnalyzedSentence("Els rentats de cervell."));
  assertEquals(1, found.length);
  final String[] rentatsSuggestions = {"rentades", "rentatges", "rentaments"};
  for (int i = 0; i < rentatsSuggestions.length; i++) {
    assertEquals(rentatsSuggestions[i], found[0].getSuggestedReplacements().get(i));
  }
}
@Test public void testRule() throws IOException { DifferentPunctuationRule rule = new DifferentPunctuationRule(); RuleMatch[] matches; JLanguageTool srcLangTool = new JLanguageTool(TestTools.getDemoLanguage()); JLanguageTool trgLangTool = new JLanguageTool(new FakeLanguage()); rule.setSourceLanguage(TestTools.getDemoLanguage()); // correct sentences: matches = rule.match( srcLangTool.getAnalyzedSentence("This is a test sentence!"), trgLangTool.getAnalyzedSentence("C'est la vie!")); assertEquals(0, matches.length); matches = rule.match( srcLangTool.getAnalyzedSentence("one sentence"), trgLangTool.getAnalyzedSentence("jedno zdanie")); assertEquals(0, matches.length); // incorrect sentences: matches = rule.match( srcLangTool.getAnalyzedSentence("This this is a test sentence."), trgLangTool.getAnalyzedSentence("This this is a test sentence!")); assertEquals(1, matches.length); }
public void testMultipleSentences() throws IOException { final JLanguageTool tool = new JLanguageTool(new Catalan()); tool.enableRule("CA_UNPAIRED_BRACKETS"); List<RuleMatch> matches; matches = tool.check( "Aquesta és una sentència múltiple amb claudàtors: " + "[Ací hi ha un claudàtor. Amb algun text.] i ací continua.\n"); assertEquals(0, matches.size()); matches = tool.check("\"Sóc la teva filla. El corcó no et rosegarà més.\"\n\n"); assertEquals(0, matches.size()); matches = tool.check("\"Sóc la teva filla. El corcó no et rosegarà més\".\n\n"); assertEquals(0, matches.size()); matches = tool.check( "Aquesta és una sentència múltiple amb claudàtors: " + "[Ací hi ha un claudàtor. Amb algun text. I ací continua.\n\n"); assertEquals(1, matches.size()); matches = tool.check("«Els manaments diuen: \"No desitjaràs la dona del teu veí\"»"); // assertEquals(0, matches.size()); // now with a paragraph end inside - we get two alarms because of paragraph // resetting matches = tool.check( "Aquesta és una sentència múltiple amb parèntesis " + "(Ací hi ha un parèntesi. \n\n Amb algun text.) i ací continua."); assertEquals(2, matches.size()); }
@Test public void testRule() throws IOException { final MixedAlphabetsRule rule = new MixedAlphabetsRule(TestTools.getMessages("uk")); final JLanguageTool langTool = new JLanguageTool(new Ukrainian()); // correct sentences: assertEquals(0, rule.match(langTool.getAnalyzedSentence("сміття")).length); // incorrect sentences: RuleMatch[] matches = rule.match(langTool.getAnalyzedSentence("смi\u00ADття")); // check match positions: assertEquals(1, matches.length); assertEquals(Arrays.asList("сміття"), matches[0].getSuggestedReplacements()); }
/**
 * Reads standard input line by line and passes the accumulated text to {@code tagText}.
 *
 * <p>The buffered text is flushed after every line when the language's sentence tokenizer
 * reports that single line breaks mark paragraphs; otherwise it is flushed on an empty line or
 * once the buffer reaches 64,000 characters. Text still buffered when reading ends (normally or
 * through an exception) is tagged as well.
 *
 * @param lt tool that supplies the language settings and is passed on to {@code tagText}
 * @throws IOException if reading standard input or tagging fails
 */
private static void runOnStdIn(final JLanguageTool lt) throws IOException {
  final int maxFileSize = 64_000;
  final StringBuilder sb = new StringBuilder();
  // try-with-resources closes the readers on every path; previously they were closed only
  // after the try/finally and therefore leaked whenever an exception was thrown.
  try (InputStreamReader isr = new InputStreamReader(new BufferedInputStream(System.in));
      BufferedReader br = new BufferedReader(isr)) {
    String line;
    while ((line = br.readLine()) != null) {
      sb.append(line);
      sb.append('\n');
      final boolean flushNow =
          lt.getLanguage().getSentenceTokenizer().singleLineBreaksMarksPara()
              || "".equals(line)
              || sb.length() >= maxFileSize;
      if (flushNow) {
        tagText(sb.toString(), lt);
        sb.setLength(0);
      }
    }
  } finally {
    // Tag whatever is left in the buffer, even if reading was interrupted by an exception.
    if (sb.length() > 0) {
      tagText(sb.toString(), lt);
    }
  }
}
@Override protected void init() throws IOException { super.init(); final String langCountry; if (language.getCountries().length > 0) { langCountry = language.getShortName() + "_" + language.getCountries()[0]; } else { langCountry = language.getShortName(); } final String shortDicPath = "/" + language.getShortName() + "/hunspell/" + langCountry + ".dic"; String wordChars = ""; // set dictionary only if there are dictionary files: if (JLanguageTool.getDataBroker().resourceExists(shortDicPath)) { final String path = getDictionaryPath(langCountry, shortDicPath); if ("".equals(path)) { hunspellDict = null; } else { hunspellDict = Hunspell.getInstance().getDictionary(path); if (!"".equals(hunspellDict.getWordChars())) { wordChars = "(?![" + hunspellDict.getWordChars().replace("-", "\\-") + "])"; } addIgnoreWords(); } } nonWordPattern = Pattern.compile(wordChars + NON_ALPHABETIC); needsInit = false; }
/**
 * Asserts that checking {@code input} with {@code langTool} yields exactly
 * {@code expectedMatches} matches; the failure message lists the matches that were found.
 *
 * @param input text to check
 * @param expectedMatches expected number of matches
 */
private void assertMatches(String input, int expectedMatches) throws IOException {
  List<RuleMatch> found = langTool.check(input);
  String failureText = "Expected " + expectedMatches + " matches, got: " + found;
  assertEquals(failureText, expectedMatches, found.size());
}
/**
 * Creates the dialog for an existing configuration.
 *
 * <p>The passed-in configuration is kept as {@code original}; the dialog's own {@code config}
 * is whatever {@code Configuration.copy} returns for it.
 *
 * @param owner frame stored as the dialog's owner
 * @param insideOOo flag stored as given
 * @param config configuration to start from; must not be {@code null}
 */
public ConfigurationDialog(Frame owner, boolean insideOOo, Configuration config) {
  this.insideOOo = insideOOo;
  this.owner = owner;
  this.original = config;
  // Here "config" is the parameter, i.e. the same object just stored in "original".
  this.config = config.copy(config);
  messages = JLanguageTool.getMessageBundle();
}
/**
 * Re-checks the sentence stored in each hit of {@code topDocs} with {@code languageTool} and
 * collects the sentences for which at least one rule match is reported.
 *
 * @param indexSearcher searcher used to load the document of each hit
 * @param topDocs search hits to inspect
 * @param languageTool tool used to check and analyze each sentence
 * @return one {@code MatchingSentence} per hit with matches, in hit order
 */
private List<MatchingSentence> findMatchingSentences(
    IndexSearcher indexSearcher, TopDocs topDocs, JLanguageTool languageTool)
    throws IOException {
  final List<MatchingSentence> result = new ArrayList<>();
  for (ScoreDoc hit : topDocs.scoreDocs) {
    final Document doc = indexSearcher.doc(hit.doc);
    final String sentence = doc.get(FIELD_NAME);
    final List<RuleMatch> ruleMatches = languageTool.check(sentence);
    if (ruleMatches.isEmpty()) {
      continue;
    }
    final String source = doc.get(SOURCE_FIELD_NAME);
    final AnalyzedSentence analyzed = languageTool.getAnalyzedSentence(sentence);
    result.add(new MatchingSentence(sentence, source, analyzed, ruleMatches));
  }
  return result;
}
/**
 * Creates the dialog with a newly constructed {@code Configuration} and no original
 * configuration ({@code original} is set to {@code null}).
 *
 * @param owner frame stored as the dialog's owner
 * @param insideOOo flag stored as given
 * @deprecated NOTE(review): presumably superseded by the constructor that takes a
 *     {@code Configuration} - confirm.
 */
@Deprecated
public ConfigurationDialog(Frame owner, boolean insideOOo) {
  this.insideOOo = insideOOo;
  this.owner = owner;
  this.config = new Configuration();
  this.original = null;
  messages = JLanguageTool.getMessageBundle();
}
/**
 * Asserts that the rule reports exactly one match for {@code s} and, if any expected suggestions
 * are given, that the match's suggested replacements equal them in order.
 *
 * @param s sentence to check
 * @param expectedSuggestions expected replacements; pass none to skip the suggestion check
 */
private void assertBad(String s, String... expectedSuggestions) throws IOException {
  RuleMatch[] matches = rule.match(langTool.getAnalyzedSentence(s));
  assertEquals("Did not find one match in sentence '" + s + "'", 1, matches.length);
  if (expectedSuggestions.length == 0) {
    return;
  }
  assertThat(matches[0].getSuggestedReplacements(), is(Arrays.asList(expectedSuggestions)));
}
/**
 * Creates a speller with the given maximum edit distance.
 *
 * @param filename path in classpath to morfologik dictionary
 * @param conversionLocale used when transforming the word to lowercase; if {@code null},
 *     {@link Locale#getDefault()} is used
 * @param maxEditDistance maximum edit distance passed to the underlying speller; must be
 *     greater than 0
 * @throws IllegalArgumentException if {@code maxEditDistance} is 0 or negative
 * @throws IOException declared for the dictionary lookup and loading
 */
public MorfologikSpeller(String filename, Locale conversionLocale, int maxEditDistance)
    throws IOException {
  if (maxEditDistance <= 0) {
    // IllegalArgumentException is a RuntimeException, so existing callers keep working.
    throw new IllegalArgumentException("maxEditDistance must be > 0: " + maxEditDistance);
  }
  final URL url = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(filename);
  dictionary = Dictionary.read(url);
  speller = new Speller(dictionary, maxEditDistance);
  this.conversionLocale = conversionLocale != null ? conversionLocale : Locale.getDefault();
}
/**
 * Adds the two LanguageTool name constants from {@code SpellingCheckRule} and every line of the
 * ignore file that does not start with {@code #} to {@code hunspellDict}.
 *
 * <p>The ignore file is located via {@code getIgnoreFileName()} and read as UTF-8.
 */
private void addIgnoreWords() throws IOException {
  hunspellDict.addWord(SpellingCheckRule.LANGUAGETOOL);
  hunspellDict.addWord(SpellingCheckRule.LANGUAGETOOL_FX);

  URL ignoreUrl = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(getIgnoreFileName());
  for (String entry : Resources.readLines(ignoreUrl, Charsets.UTF_8)) {
    if (entry.startsWith("#")) {
      continue;
    }
    hunspellDict.addWord(entry);
  }
}
private void initializeIfRequired() throws IOException { // Lazy initialize fields when needed and only once. if (manualTagger == null) { synchronized (this) { if (manualTagger == null) { manualTagger = new ManualTagger( JLanguageTool.getDataBroker().getFromResourceDirAsStream(USER_DICT_FILENAME)); } } } }
@Nullable private static MorfologikMultiSpeller getSpeller(Language language) { if (!language.getShortName().equals(Locale.GERMAN.getLanguage())) { throw new RuntimeException("Language is not a variant of German: " + language); } try { String morfoFile = "/de/hunspell/de_" + language.getCountries()[0] + ".dict"; if (JLanguageTool.getDataBroker().resourceExists(morfoFile)) { // spell data will not exist in LibreOffice/OpenOffice context try (InputStream stream = JLanguageTool.getDataBroker() .getFromResourceDirAsStream("/de/hunspell/spelling.txt"); BufferedReader br = new BufferedReader(new InputStreamReader(stream, "utf-8"))) { return new MorfologikMultiSpeller(morfoFile, new ExpandingReader(br), MAX_EDIT_DISTANCE); } } else { return null; } } catch (IOException e) { throw new RuntimeException("Could not set up morfologik spell checker", e); } }
private void assertBad(String s, int n, String... expectedSuggestions) throws IOException { RuleMatch[] matches = rule.match(langTool.getAnalyzedSentence(s)); assertEquals("Did not find " + n + " match(es) in sentence '" + s + "'", n, matches.length); if (expectedSuggestions.length > 0) { RuleMatch match = matches[0]; // When two errors are reported by the rule (so TODO above), it might happen that the first // match does not have the suggestions, but the second one if (matches.length > 1 && match.getSuggestedReplacements().size() == 0) { match = matches[1]; } List<String> suggestions = match.getSuggestedReplacements(); assertThat(suggestions, is(Arrays.asList(expectedSuggestions))); } }
/**
 * Loads the {@code grammar.xml} of the given language from a hard-coded checkout directory,
 * calls {@code simplify} for every active {@code PatternRule} accepted by {@code isSimple},
 * and prints the resulting XML lines to standard output. Progress and the final count go to
 * standard error.
 *
 * @param lang language whose grammar file is processed
 * @throws RuntimeException if the hard-coded base path does not exist
 * @throws IOException if the grammar file cannot be read
 */
private void run(Language lang) throws IOException {
  File basePath = new File("/lt/git/languagetool/languagetool-language-modules");
  if (!basePath.exists()) {
    throw new RuntimeException("basePath does not exist: " + basePath);
  }
  String langCode = lang.getShortName();
  File xml =
      new File(
          basePath,
          "/"
              + langCode
              + "/src/main/resources/org/languagetool/rules/"
              + langCode
              + "/grammar.xml");
  final List<String> xmlLines;
  // IOUtils.readLines(Reader) leaves the reader open, so it is closed here; previously the
  // FileReader was never closed.
  // NOTE(review): FileReader decodes with the platform default charset - confirm that this
  // matches the encoding of grammar.xml.
  try (FileReader reader = new FileReader(xml)) {
    xmlLines = IOUtils.readLines(reader);
  }
  JLanguageTool tool = new JLanguageTool(lang);
  int totalRules = 0;
  for (Rule rule : tool.getAllActiveRules()) {
    if (!(rule instanceof PatternRule)) {
      continue;
    }
    PatternRule patternRule = (PatternRule) rule;
    String id = patternRule.getFullId();
    if (isSimple(patternRule)) {
      System.err.println("Simplifying: " + id);
      simplify(patternRule, xmlLines);
    } else {
      System.err.println("Can't simplify: " + id);
    }
    totalRules++;
  }
  System.err.println("touchedRulesCount: " + touchedRulesCount + " out of " + totalRules);
  for (String xmlLine : xmlLines) {
    System.out.println(xmlLine);
  }
}
private String getDictionaryPath(final String dicName, final String originalPath) throws IOException { final URL dictURL = JLanguageTool.getDataBroker().getFromResourceDirAsUrl(originalPath); String dictionaryPath; // in the webstart, java EE or OSGi bundle version, we need to copy the files outside the jar // to the local temporary directory if ("jar".equals(dictURL.getProtocol()) || "vfs".equals(dictURL.getProtocol()) || "bundle".equals(dictURL.getProtocol())) { final File tempDir = new File(System.getProperty("java.io.tmpdir")); File tempDicFile = new File(tempDir, dicName + ".dic"); JLanguageTool.addTemporaryFile(tempDicFile); try (InputStream dicStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(originalPath)) { fileCopy(dicStream, tempDicFile); } File tempAffFile = new File(tempDir, dicName + ".aff"); JLanguageTool.addTemporaryFile(tempAffFile); try (InputStream affStream = JLanguageTool.getDataBroker() .getFromResourceDirAsStream(originalPath.replaceFirst(".dic$", ".aff"))) { fileCopy(affStream, tempAffFile); } dictionaryPath = tempDir.getAbsolutePath() + "/" + dicName; } else { final int suffixLength = ".dic".length(); try { dictionaryPath = new File(dictURL.toURI()).getAbsolutePath(); dictionaryPath = dictionaryPath.substring(0, dictionaryPath.length() - suffixLength); } catch (URISyntaxException e) { return ""; } } return dictionaryPath; }
public AbstractCompoundRule( final ResourceBundle messages, final String fileName, final String withHyphenMessage, final String withoutHyphenMessage, final String withOrWithoutHyphenMessage) throws IOException { if (messages != null) { super.setCategory(new Category(messages.getString("category_misc"))); } loadCompoundFile(JLanguageTool.getDataBroker().getFromResourceDirAsStream(fileName), "UTF-8"); this.withHyphenMessage = withHyphenMessage; this.withoutHyphenMessage = withoutHyphenMessage; this.withOrWithoutHyphenMessage = withOrWithoutHyphenMessage; setLocQualityIssueType(ITSIssueType.Misspelling); }
/**
 * Loads {@code /yy/confusion_sets.txt} with {@code ConfusionSetLoader} and verifies the number
 * of keys, the number of confusion sets and their factors for selected keys, and the contents
 * of selected sets.
 */
@Test
public void testLoadWithStrictLimits() throws IOException {
  try (InputStream inputStream =
      JLanguageTool.getDataBroker().getFromResourceDirAsStream("/yy/confusion_sets.txt")) {
    Map<String, List<ConfusionSet>> map = new ConfusionSetLoader().loadConfusionSet(inputStream);
    assertThat(map.size(), is(10));

    // Set counts and factors per key.
    List<ConfusionSet> thereSets = map.get("there");
    assertThat(thereSets.size(), is(1));
    assertThat(thereSets.get(0).getFactor(), is(10L));

    List<ConfusionSet> theirSets = map.get("their");
    assertThat(theirSets.size(), is(1));
    assertThat(theirSets.get(0).getFactor(), is(10L));

    List<ConfusionSet> fooSets = map.get("foo");
    assertThat(fooSets.size(), is(2));
    assertThat(fooSets.get(0).getFactor(), is(5L));
    assertThat(fooSets.get(1).getFactor(), is(8L));

    List<ConfusionSet> gooSets = map.get("goo");
    assertThat(gooSets.size(), is(2));
    assertThat(gooSets.get(0).getFactor(), is(11L));
    assertThat(gooSets.get(1).getFactor(), is(12L));

    assertThat(map.get("lol").size(), is(1));
    assertThat(map.get("something").size(), is(1));

    List<ConfusionSet> barSets = map.get("bar");
    assertThat(barSets.size(), is(1));
    assertThat(barSets.get(0).getFactor(), is(5L));

    // Contents of the sets.
    Set<ConfusionString> there = thereSets.get(0).getSet();
    assertTrue(getAsString(there).contains("there - example 1"));
    assertTrue(getAsString(there).contains("their - example 2"));

    Set<ConfusionString> their = theirSets.get(0).getSet();
    assertTrue(getAsString(their).contains("there - example 1"));
    assertTrue(getAsString(their).contains("their - example 2"));
    assertFalse(getAsString(their).contains("comment"));

    // The first "foo" set is checked for both "foo" and "bar", the second one for "baz".
    Set<ConfusionString> firstFooSet = fooSets.get(0).getSet();
    assertTrue(getAsString(firstFooSet).contains("foo"));
    Set<ConfusionString> firstFooSetAgain = fooSets.get(0).getSet();
    assertTrue(getAsString(firstFooSetAgain).contains("bar"));
    Set<ConfusionString> secondFooSet = fooSets.get(1).getSet();
    assertTrue(getAsString(secondFooSet).contains("baz"));
  }
}