/**
 * Builds an HTML table listing, per non-variant language that has maintainers, the translated
 * language name and the names of its maintainers. Rows are ordered by the translated name;
 * a line break is inserted after every third maintainer.
 */
private String getMaintainers() {
  // TreeMap keeps the rows sorted by the translated language name
  final TreeMap<String, Language> byTranslatedName = new TreeMap<>();
  for (final Language candidate : Language.REAL_LANGUAGES) {
    if (!candidate.isVariant() && candidate.getMaintainers() != null) {
      byTranslatedName.put(messages.getString(candidate.getShortName()), candidate);
    }
  }

  final StringBuilder html = new StringBuilder();
  html.append("<table border=0 cellspacing=0 cellpadding=0>");
  for (String translatedName : byTranslatedName.keySet()) {
    html.append("<tr valign=\"top\"><td>")
        .append(translatedName)
        .append(":</td>")
        .append("<td> </td>")
        .append("<td>");
    int position = 0;
    for (Contributor maintainer : byTranslatedName.get(translatedName).getMaintainers()) {
      if (position > 0) {
        html.append(", ");
        if (position % 3 == 0) {
          html.append("<br>");
        }
      }
      html.append(maintainer.getName());
      position++;
    }
    html.append("</td></tr>");
  }
  html.append("</table>");
  return html.toString();
}
/**
 * Verifies for every language that the number of unpaired-rule start symbols equals the number
 * of unpaired-rule end symbols.
 */
public void testStartSymbolCountEqualsEndSymbolCount() throws IOException {
  for (Language lang : Language.LANGUAGES) {
    final String failureMessage = "Different number of start and end symbols for " + lang;
    assertEquals(
        failureMessage,
        lang.getUnpairedRuleStartSymbols().length,
        lang.getUnpairedRuleEndSymbols().length);
  }
}
/**
 * Returns the selectable mother tongue entries: the "no mother tongue" placeholder first,
 * followed by the translated name of every real language.
 */
private Object[] getPossibleMotherTongues() {
  final List<Object> entries = new ArrayList<>();
  entries.add(NO_MOTHER_TONGUE);
  for (final Language realLanguage : Language.REAL_LANGUAGES) {
    final String translatedName = realLanguage.getTranslatedName(messages);
    entries.add(translatedName);
  }
  return entries.toArray();
}
/**
 * Looks up the Language whose translated name equals the given localized language name.
 *
 * @param languageName e.g. <code>English</code> or <code>German</code> (case is significant)
 * @return the first matching Language object, or <code>null</code> if none matches
 */
private Language getLanguageForLocalizedName(final String languageName) {
  Language match = null;
  for (final Language candidate : Language.REAL_LANGUAGES) {
    final String translatedName = candidate.getTranslatedName(messages);
    if (languageName.equals(translatedName)) {
      match = candidate;
      break;
    }
  }
  return match;
}
/**
 * Command line entry point: runs the given rules against a Lucene index and prints every
 * matching sentence together with timing information.
 *
 * <p>Arguments: comma-separated rule ids, rule file, language code, index directory and an
 * optional {@code --no_limit} flag that lifts the search limit.
 */
public static void main(String[] args) throws Exception {
  ensureCorrectUsageOrExit(args);
  final long overallStart = System.currentTimeMillis();

  final String[] ruleIds = args[0].split(",");
  final File ruleFile = new File(args[1]);
  final String languageCode = args[2];
  final Language language = Language.getLanguageForShortName(languageCode);
  final File indexDir = new File(args[3]);
  final boolean noLimitRequested = args.length > 4 && "--no_limit".equals(args[4]);
  if (noLimitRequested) {
    limitSearch = false;
  }

  final Searcher searcher = new Searcher(new SimpleFSDirectory(indexDir));
  if (!limitSearch) {
    searcher.setMaxHits(100000);
  }

  for (String ruleId : ruleIds) {
    final long ruleStart = System.currentTimeMillis();
    for (PatternRule rule : searcher.getRuleById(ruleId, ruleFile)) {
      final SearcherResult result = searcher.findRuleMatchesOnIndex(rule, language);
      if (result.getMatchingSentences().size() == 0) {
        System.out.println("[no matches]");
      }
      int hitNumber = 0;
      for (MatchingSentence hit : result.getMatchingSentences()) {
        hitNumber++;
        System.out.println(
            hitNumber + ": " + hit.getSentence() + " (Source: " + hit.getSource() + ")");
      }
      final long ruleMillis = System.currentTimeMillis() - ruleStart;
      System.out.println("Time: " + ruleMillis + "ms");
      System.out.println("==============================================================");
    }
  }
  final long totalMillis = System.currentTimeMillis() - overallStart;
  System.out.println("Total time: " + totalMillis + "ms");
}
/**
 * Resolves the language with the given short name. If no language matches, prints the list of
 * supported short names and delegates to {@code exitWithUsageMessage()}.
 *
 * @param lang the language short name to look up
 * @return the matching language, or {@code null} if none was found and
 *     {@code exitWithUsageMessage()} returned
 */
private static Language getLanguageOrExit(final String lang) {
  final List<String> knownCodes = new ArrayList<>();
  Language match = null;
  for (final Language candidate : Languages.get()) {
    final String code = candidate.getShortName();
    knownCodes.add(code);
    if (lang.equals(code)) {
      match = candidate;
      break;
    }
  }
  if (match == null) {
    // the loop ran to completion, so knownCodes holds every supported code
    System.out.println(
        "Unknown language '" + lang + "'. Supported languages are: " + knownCodes);
    exitWithUsageMessage();
  }
  return match;
}
/**
 * Handles the Dutch case of a sentence that starts with an apostrophe followed by one of the
 * single letters k, m, n, r, s or t: the token at index 3 is then returned so the caller can
 * check it instead of the sentence-initial tokens.
 *
 * @param firstToken the first token of the sentence
 * @param secondToken the token following {@code firstToken}
 * @param tokens all tokens of the sentence
 * @return the token at index 3 if the special case applies, {@code null} otherwise (also for
 *     all languages other than Dutch)
 */
private String dutchSpecialCase(
    final String firstToken, final String secondToken, final AnalyzedTokenReadings[] tokens) {
  if (!language.getShortName().equals("nl")) {
    return null;
  }
  // tokens[3] is read below, so at least four tokens are required; the previous check
  // (length >= 3) caused an ArrayIndexOutOfBoundsException for three-token sentences
  if (tokens.length > 3 && firstToken.equals("'") && secondToken.matches("k|m|n|r|s|t")) {
    return tokens[3].getToken();
  }
  return null;
}
/**
 * Handles the Dutch case of a sentence that starts with an apostrophe followed by a token
 * matching {@code DUTCH_SPECIAL_CASE}: the token at index 3 is then returned so the caller can
 * check it instead of the sentence-initial tokens.
 *
 * @param firstToken the first token of the sentence
 * @param secondToken the token following {@code firstToken}
 * @param tokens all tokens of the sentence
 * @return the token at index 3 if the special case applies, {@code null} otherwise (also for
 *     all languages other than Dutch)
 */
private String dutchSpecialCase(
    final String firstToken, final String secondToken, final AnalyzedTokenReadings[] tokens) {
  if (!language.getShortName().equals("nl")) {
    return null;
  }
  // tokens[3] is read below, so at least four tokens are required; the previous check
  // (length >= 3) caused an ArrayIndexOutOfBoundsException for three-token sentences
  if (tokens.length > 3
      && firstToken.equals("'")
      && DUTCH_SPECIAL_CASE.matcher(secondToken).matches()) {
    return tokens[3].getToken();
  }
  return null;
}
@Nullable private static MorfologikMultiSpeller getSpeller(Language language) { if (!language.getShortName().equals(Locale.GERMAN.getLanguage())) { throw new RuntimeException("Language is not a variant of German: " + language); } try { String morfoFile = "/de/hunspell/de_" + language.getCountries()[0] + ".dict"; if (JLanguageTool.getDataBroker().resourceExists(morfoFile)) { // spell data will not exist in LibreOffice/OpenOffice context try (InputStream stream = JLanguageTool.getDataBroker() .getFromResourceDirAsStream("/de/hunspell/spelling.txt"); BufferedReader br = new BufferedReader(new InputStreamReader(stream, "utf-8"))) { return new MorfologikMultiSpeller(morfoFile, new ExpandingReader(br), MAX_EDIT_DISTANCE); } } else { return null; } } catch (IOException e) { throw new RuntimeException("Could not set up morfologik spell checker", e); } }
/**
 * Reads the most recent value of the given date column from {@code feed_matches} for the
 * given language.
 *
 * @param dateField name of the timestamp column to read and sort by
 * @param language the language whose short name is matched against {@code language_code}
 * @return the latest date, or {@code new Date(0)} if there is no row or the value is NULL
 * @throws RuntimeException if the database access fails
 */
private Date getLatestDate(String dateField, Language language) {
  final String query =
      "SELECT "
          + dateField
          + " FROM feed_matches WHERE language_code = ? ORDER BY "
          + dateField
          + " DESC";
  try (PreparedStatement statement = conn.prepareStatement(query)) {
    statement.setString(1, language.getShortName());
    ResultSet rows = statement.executeQuery();
    // rows are sorted descending, so the first row carries the latest value
    final boolean hasDate = rows.next() && rows.getTimestamp(dateField) != null;
    if (hasDate) {
      return new Date(rows.getTimestamp(dateField).getTime());
    }
  } catch (Exception e) {
    throw new RuntimeException("Could not get latest " + dateField + " from database", e);
  }
  return new Date(0);
}
/**
 * Loads the grammar.xml of the given language from the hard-coded checkout location, tries to
 * simplify every active {@link PatternRule} that {@code isSimple} accepts, and prints the
 * resulting XML lines to stdout. Progress and statistics go to stderr.
 *
 * @param lang the language whose grammar file is processed
 * @throws IOException if the grammar file cannot be read
 * @throws RuntimeException if the base path does not exist
 */
private void run(Language lang) throws IOException {
  File basePath = new File("/lt/git/languagetool/languagetool-language-modules");
  if (!basePath.exists()) {
    throw new RuntimeException("basePath does not exist: " + basePath);
  }
  String langCode = lang.getShortName();
  File xml =
      new File(
          basePath,
          "/"
              + langCode
              + "/src/main/resources/org/languagetool/rules/"
              + langCode
              + "/grammar.xml");
  final List<String> xmlLines;
  // IOUtils.readLines(Reader) does not close the reader, so close it here to avoid
  // leaking the file handle
  try (FileReader reader = new FileReader(xml)) {
    xmlLines = IOUtils.readLines(reader);
  }
  JLanguageTool tool = new JLanguageTool(lang);
  int totalRules = 0;
  for (Rule rule : tool.getAllActiveRules()) {
    if (!(rule instanceof PatternRule)) {
      continue;
    }
    PatternRule patternRule = (PatternRule) rule;
    String id = patternRule.getFullId();
    if (isSimple(patternRule)) {
      System.err.println("Simplifying: " + id);
      simplify(patternRule, xmlLines);
    } else {
      System.err.println("Can't simplify: " + id);
    }
    totalRules++;
  }
  System.err.println("touchedRulesCount: " + touchedRulesCount + " out of " + totalRules);
  for (String xmlLine : xmlLines) {
    System.out.println(xmlLine);
  }
}
/**
 * Returns the name of the spelling file. That file lists words that are accepted and used for
 * suggestions even when the spell checker would not accept them. The name is the language's
 * short name followed by {@code SPELLING_FILE}.
 *
 * @since 2.9
 */
protected String getSpellingFileName() {
  final String languageCode = language.getShortName();
  return languageCode + SPELLING_FILE;
}
/** * Takes an XML file named <tt>rules-xx-language.xml</tt>, e.g. <tt>rules-de-German.xml</tt> and * builds a Language object for that language. */ private static Language makeLanguage(final File file, final boolean isAdditional) throws IllegalAccessException, InstantiationException { Objects.requireNonNull(file, "file cannot be null"); if (!file.getName().endsWith(".xml")) { throw new RuleFilenameException(file); } final String[] parts = file.getName().split("-"); final boolean startsWithRules = parts[0].equals("rules"); final boolean secondPartHasCorrectLength = parts.length == 3 && (parts[1].length() == "en".length() || parts[1].length() == "ast".length() || parts[1].length() == "en_US".length()); if (!startsWithRules || !secondPartHasCorrectLength) { throw new RuleFilenameException(file); } // TODO: when the XML file is mergeable with // other rules (check this in the XML Rule Loader by using rules[@integrate='add']?), // subclass the existing language, // and adjust the settings if any are set in the rule file default configuration set Language newLanguage; if (Language.isLanguageSupported(parts[1])) { newLanguage = Language.getLanguageForShortName(parts[1]).getClass().newInstance(); newLanguage.addExternalRuleFile(file.getAbsolutePath()); newLanguage.setName(parts[2].replace(".xml", "")); newLanguage.makeExternal(); } else { newLanguage = new Language() { @Override public Locale getLocale() { return new Locale(getShortName()); } @Override public Contributor[] getMaintainers() { return null; } @Override public String getShortName() { if (parts[1].length() == 2) { return parts[1]; } return parts[1].split("_")[0]; // en as in en_US } @Override public String[] getCountries() { if (parts[1].length() == 2) { return new String[] {""}; } return new String[] {parts[1].split("_")[1]}; // US as in en_US } @Override public String getName() { return parts[2].replace(".xml", ""); } @Override public void setName(final String name) { // cannot be changed for this language } 
@Override public List<Class<? extends Rule>> getRelevantRules() { return Collections.emptyList(); } @Override public List<String> getRuleFileNames() { final List<String> ruleFiles = new ArrayList<>(); ruleFiles.add(file.getAbsolutePath()); return ruleFiles; } @Override public boolean isExternal() { return isAdditional; } }; } return newLanguage; }
public static void main(String[] args) throws IOException, InterruptedException { if (args.length != 2 && args.length != 3) { System.out.println( "Usage: " + AtomFeedCheckerCmd.class.getSimpleName() + " <atomFeedUrl> <sleepTime> [database.properties]"); System.out.println(" <atomFeedUrl> is a Wikipedia URL to the latest changes, for example:"); System.out.println( " https://de.wikipedia.org/w/index.php?title=Spezial:Letzte_%C3%84nderungen&feed=atom&namespace=0"); System.out.println( " <sleepTime> -1: don't loop at all (run once), 0: run in loop, other number: run in loop and"); System.out.println(" wait this many milliseconds between runs"); System.out.println( " [database.properties] (optional) is a file that defines dbUrl, dbUser, and dbPassword,"); System.out.println(" used to write the results to a database via JDBC"); System.out.println(""); System.out.println( " When the database.properties file is specified, this command will store all feed changes that"); System.out.println( " cause LanguageTool rule matches to the database. If an error is then fixed later, this will"); System.out.println( " usually also be detected and the rule match in the database will be marked as fixed. 
One case"); System.out.println( " where this does not work is if the context of the error gets modified before the error is fixed."); System.out.println(""); System.out.println( " Run this command regularly so that you don't miss any changes from the feed."); System.out.println( " As the feed may contain only the latest 50 changes, running it more often than"); System.out.println(" once per minute may be needed for active Wikipedias."); System.exit(1); } String url = args[0]; String langCode = url.substring(url.indexOf("//") + 2, url.indexOf(".")); System.out.println("Using URL: " + url); System.out.println("Language code: " + langCode); int sleepTimeMillis = Integer.parseInt(args[1]); System.out.println("Sleep time: " + sleepTimeMillis + "ms (-1 = don't loop)"); DatabaseConfig databaseConfig = null; if (args.length == 3) { String propFile = args[2]; databaseConfig = new DatabaseConfig(propFile); System.out.println("Writing results to database at: " + databaseConfig.getUrl()); } Language language = Language.getLanguageForShortName(langCode); AtomFeedChecker atomFeedChecker = new AtomFeedChecker(language, databaseConfig); while (true) { long startTime = System.currentTimeMillis(); try { atomFeedChecker.runCheck(url); System.out.println("Run time: " + (System.currentTimeMillis() - startTime) + "ms"); if (sleepTimeMillis == -1) { // don't loop at all break; } else { System.out.println("Sleeping " + sleepTimeMillis + "ms..."); Thread.sleep(sleepTimeMillis); } } catch (Exception e) { // e.g. 50x HTTP errors, network problems e.printStackTrace(); System.out.println("Sleeping " + sleepTimeMillis + "ms..."); Thread.sleep(sleepTimeMillis); } } }
/**
 * Completes the token currently being built: creates or updates {@code tokenElement} from the
 * collected text, applies the pending per-token settings, stores the element in
 * {@code elementList} (or attaches it to the preceding element when inside an and/or group),
 * and finally calls {@code resetToken()}.
 */
protected void finalizeTokens() {
  // Create a fresh element unless an exception was set on an already existing element;
  // in that case only the element's string is updated.
  if (!exceptionSet || tokenElement == null) {
    tokenElement = new Element( StringTools.trimWhitespace(elements.toString()), caseSensitive, regExpression, tokenInflected);
    tokenElement.setNegation(tokenNegated);
  } else {
    tokenElement.setStringElement(StringTools.trimWhitespace(elements.toString()));
  }
  // Apply pending settings and reset each of them to its neutral value
  // (0 for skip, 1 for min/max occurrence, null for POS token and chunk tag).
  if (skipPos != 0) {
    tokenElement.setSkipNext(skipPos);
    skipPos = 0;
  }
  if (minOccurrence != 1) {
    tokenElement.setMinOccurrence(minOccurrence);
    minOccurrence = 1;
  }
  if (maxOccurrence != 1) {
    tokenElement.setMaxOccurrence(maxOccurrence);
    maxOccurrence = 1;
  }
  if (posToken != null) {
    tokenElement.setPosElement(posToken, posRegExp, posNegation);
    posToken = null;
  }
  if (chunkTag != null) {
    tokenElement.setChunkElement(chunkTag);
    chunkTag = null;
  }
  // Note: tokenReference is not reset here.
  if (tokenReference != null) {
    tokenElement.setMatch(tokenReference);
  }
  // The first element of an and/or group is added to the list itself; every further
  // element of the group is attached to the last element in the list.
  if (inAndGroup && andGroupCounter > 0) {
    elementList.get(elementList.size() - 1).setAndGroupElement(tokenElement);
  } else if (inOrGroup && orGroupCounter > 0) {
    elementList.get(elementList.size() - 1).setOrGroupElement(tokenElement);
  } else {
    elementList.add(tokenElement);
  }
  if (inAndGroup) {
    andGroupCounter++;
  }
  if (inOrGroup) {
    orGroupCounter++;
  }
  if (inUnification) {
    tokenElement.setUnification(equivalenceFeatures);
  }
  tokenElement.setInsideMarker(inMarker);
  // Inside a unification definition the element is registered as an equivalence with the
  // language's unifier configuration, and the element list is emptied.
  if (inUnificationDef) {
    language.getUnifierConfiguration().setEquivalence(uFeature, uType, tokenElement);
    elementList.clear();
  }
  if (tokenSpaceBeforeSet) {
    tokenElement.setWhitespaceBefore(tokenSpaceBefore);
  }
  resetToken();
}
/**
 * Returns the name of the ignore file. That file lists words that are accepted even when the
 * spell checker would not accept them. Unlike with {@link #getSpellingFileName()} the words in
 * this file will not be used for creating suggestions for misspelled words. The name is the
 * language's short name followed by {@code SPELLING_IGNORE_FILE}.
 *
 * @since 2.7
 */
protected String getIgnoreFileName() {
  final String languageCode = language.getShortName();
  return languageCode + SPELLING_IGNORE_FILE;
}
/**
 * Checks whether the word is in the ignore list, either as given or, when case conversion is
 * enabled, after lowercasing it with the language's locale.
 */
private boolean isIgnoredNoCase(String word) {
  if (wordsToBeIgnored.contains(word)) {
    return true;
  }
  if (!convertsCase) {
    return false;
  }
  final String lowercased = word.toLowerCase(language.getLocale());
  return wordsToBeIgnored.contains(lowercased);
}
/**
 * Tells whether this rule can be used for text in the given language. This only checks whether
 * this rule's class is in the list of hard-coded rules for the given language, thus it will
 * never return {@code true} for {@link org.languagetool.rules.patterns.PatternRule}s.
 *
 * @param language the language to check
 * @return {@code true} if the language's relevant rules contain this rule's class
 */
public final boolean supportsLanguage(final Language language) {
  final List<Class<? extends Rule>> ruleClasses = language.getRelevantRules();
  if (ruleClasses == null) {
    return false;
  }
  return ruleClasses.contains(this.getClass());
}
@Override public final RuleMatch[] match(final AnalyzedSentence text) { final List<RuleMatch> ruleMatches = new ArrayList<RuleMatch>(); final AnalyzedTokenReadings[] tokens = text.getTokensWithoutWhitespace(); if (tokens.length < 2) { return toRuleMatchArray(ruleMatches); } int matchTokenPos = 1; // 0 = SENT_START final String firstToken = tokens[matchTokenPos].getToken(); String secondToken = null; String thirdToken = null; // ignore quote characters: if (tokens.length >= 3 && ("'".equals(firstToken) || "\"".equals(firstToken) || "„".equals(firstToken))) { matchTokenPos = 2; secondToken = tokens[matchTokenPos].getToken(); } final String firstDutchToken = dutchSpecialCase(firstToken, secondToken, tokens); if (firstDutchToken != null) { thirdToken = firstDutchToken; matchTokenPos = 3; } String checkToken = firstToken; if (thirdToken != null) { checkToken = thirdToken; } else if (secondToken != null) { checkToken = secondToken; } String lastToken = tokens[tokens.length - 1].getToken(); if (lastToken.matches("[ \"'„»«“]") && tokens.length >= 2) { // ignore trailing whitespace or quote lastToken = tokens[tokens.length - 2].getToken(); } boolean preventError = false; // TODO: why do only *these* languages have that special case? final String langCode = language.getShortName(); final boolean languageHasSpecialCases = langCode.equals("ru") || langCode.equals("pl") || langCode.equals("uk") || langCode.equals("be") || langCode.equals(Locale.ENGLISH.getLanguage()) || langCode.equals(Locale.ITALIAN.getLanguage()) || langCode.equals(Locale.GERMAN.getLanguage()); if (languageHasSpecialCases) { // fix for lists; note - this will not always work for the last point in OOo, // as OOo might serve paragraphs in any order. 
if (";".equals(lastParagraphString) || ";".equals(lastToken) || ",".equals(lastParagraphString) || ",".equals(lastToken)) { preventError = true; } // fix for words in table (not sentences); note - this will not always work for the last point // in OOo, // as OOo might serve paragraphs in any order. if (!lastToken.matches("[.?!…]")) { preventError = true; } } lastParagraphString = lastToken; if (checkToken.length() > 0) { final char firstChar = checkToken.charAt(0); if (!preventError && Character.isLowerCase(firstChar)) { final RuleMatch ruleMatch = new RuleMatch( this, tokens[matchTokenPos].getStartPos(), tokens[matchTokenPos].getStartPos() + tokens[matchTokenPos].getToken().length(), messages.getString("incorrect_case")); ruleMatch.setSuggestedReplacement(StringTools.uppercaseFirstChar(checkToken)); ruleMatches.add(ruleMatch); } } return toRuleMatchArray(ruleMatches); }
/**
 * Returns the name of the prohibit file. That file lists words that are not accepted even when
 * the spell checker would accept them. The name is the language's short name followed by
 * {@code SPELLING_PROHIBIT_FILE}.
 *
 * @since 2.8
 */
protected String getProhibitFileName() {
  final String languageCode = language.getShortName();
  return languageCode + SPELLING_PROHIBIT_FILE;
}