/** * Tokenize some text - not thread safe. * * @param text tokenize this * @return tokenized text */ public List<List<Token>> process(final String text) { final List<List<Token>> paragraph = new ArrayList<>(); currentSentence = new ArrayList<>(); final Tokens tokens = splitText(text); while (tokens.hasNext()) { final Token t = tokens.next(); final String trimmedWord = t.text.trim(); // skip spaces if (trimmedWord.isEmpty()) continue; if (((mode == WITH_PUNCTUATION) || (mode == WITHOUT_PUNCTUATION && isLetterOrDigit(initChar(t.text))))) { boolean canBreakSentence = true; if (t.text.contains("'")) { wordContainsApostrophe(t); } else if (".".equals(trimmedWord)) { canBreakSentence = wordIsFullStop(t); } else if (":".equals(trimmedWord)) { wordIsColon(tokens, t); } else currentSentence.add(t); // handling the end of a sentence if (canBreakSentence && equalss(trimmedWord, ".", ";", "?", "!")) { paragraph.add(currentSentence); currentSentence = new ArrayList<>(); } } } if (!currentSentence.isEmpty()) paragraph.add(currentSentence); return paragraph; }
private void wordIsColon(final Tokens tokens, final Token t) { // check we can get a previous and next word to merge together if (!currentSentence.isEmpty() && tokens.hasNext()) { // if the colon does not have a space on either side if (!isSpaceChar(lastChar(tokens.peekPrev().text)) && !isSpaceChar(initChar(tokens.peekNext().text))) { // try to merge the 3 tokens back together again final int prevWordIndex = currentSentence.size() - 1; final Token prevSentenceWord = currentSentence.get(prevWordIndex); mergeWordsIntoSentence(prevSentenceWord, t, tokens.next(), prevWordIndex); } else currentSentence.add(t); } else currentSentence.add(t); }
public void tokenize(SourceCode sourceCode, Tokens tokenEntries) { StringBuilder stringBuilder = sourceCode.getCodeBuffer(); // Note that Java version is irrelevant for tokenizing LanguageVersionHandler languageVersionHandler = LanguageVersion.JAVA_14.getLanguageVersionHandler(); String fileName = sourceCode.getFileName(); TokenManager tokenMgr = languageVersionHandler .getParser(languageVersionHandler.getDefaultParserOptions()) .getTokenManager(fileName, new StringReader(stringBuilder.toString())); Token currentToken = (Token) tokenMgr.getNextToken(); TokenDiscarder discarder = new TokenDiscarder(ignoreAnnotations); while (currentToken.image.length() > 0) { discarder.updateState(currentToken); if (discarder.isDiscarding()) { currentToken = (Token) tokenMgr.getNextToken(); continue; } processToken(tokenEntries, fileName, currentToken); currentToken = (Token) tokenMgr.getNextToken(); } tokenEntries.add(TokenEntry.getEOF()); }
/**
 * Creates the ability, passing -5 and its rules text to the superclass, and registers a destroy
 * effect over every permanent that is not this ability's source, a land permanent, or a token.
 *
 * @param state the game state this ability belongs to
 */
public ElspethTirelAbility2(GameState state) {
  super(state, -5, "Destroy all other permanents except for lands and tokens.");

  // Everything the effect must leave alone.
  SetGenerator spared =
      Union.instance(ABILITY_SOURCE_OF_THIS, LandPermanents.instance(), Tokens.instance());
  // All permanents minus the spared ones.
  SetGenerator doomed = RelativeComplement.instance(Permanents.instance(), spared);

  this.addEffect(destroy(doomed, "Destroy all other permanents except for lands and tokens."));
}
/** Checks that a three-sentence text is split into three sentences with the expected tokens. */
@Test
public void sentencesTokensTest() {
  PreparedInput prepared =
      new Preparator()
          .prepare(
              "temp",
              "This is a sentence. And another one. Just for the kicks.",
              new StanfordManualPreparationSettings());
  List<List<Token>> sentences = prepared.getTokens().getSentenceTokens();

  assertEquals(3, sentences.size());

  // First sentence: "This is a sentence ."
  List<Token> first = sentences.get(0);
  assertEquals("This", first.get(0).getOriginal());
  assertEquals("sentence", first.get(3).getOriginal());
  assertEquals(".", first.get(4).getOriginal());

  // Spot checks in the second and third sentences.
  List<Token> second = sentences.get(1);
  assertEquals("another", second.get(1).getOriginal());
  List<Token> third = sentences.get(2);
  assertEquals("kicks", third.get(3).getOriginal());
}
/**
 * Requires the current token to be of type {@code tokenId} and then calls {@code read()}.
 *
 * <p>When the current token has a different type, the keyword for {@code tokenId} is looked up
 * and the exception built by {@code unexpectedTokenRequire} is thrown instead.
 *
 * @param tokenId the token type that must be current
 */
void readThis(int tokenId) {
  if (token.tokenType == tokenId) {
    read();
    return;
  }

  throw unexpectedTokenRequire(Tokens.getKeyword(tokenId));
}
/**
 * Adds an entry for {@code currentToken} to {@code tokenEntries}.
 *
 * <p>The entry normally carries the token's image. When literals are being ignored and the token
 * is a string, character, decimal or floating point literal, or when identifiers are being
 * ignored and the token is an identifier, the image is replaced by the token kind rendered as
 * text.
 *
 * @param tokenEntries the collection that receives the entry
 * @param fileName the file name recorded on the entry
 * @param currentToken the token to record
 */
private void processToken(Tokens tokenEntries, String fileName, Token currentToken) {
  final int kind = currentToken.kind;

  final boolean isLiteral =
      kind == JavaParserConstants.STRING_LITERAL
          || kind == JavaParserConstants.CHARACTER_LITERAL
          || kind == JavaParserConstants.DECIMAL_LITERAL
          || kind == JavaParserConstants.FLOATING_POINT_LITERAL;
  final boolean useKindAsImage =
      (ignoreLiterals && isLiteral)
          || (ignoreIdentifiers && kind == JavaParserConstants.IDENTIFIER);

  final String image = useKindAsImage ? String.valueOf(kind) : currentToken.image;
  tokenEntries.add(new TokenEntry(image, fileName, currentToken.beginLine));
}
/**
 * Parses a chain of multiplicative expressions joined by {@code +} or {@code -}.
 *
 * @param tokens the token source; its position is restored when nothing can be parsed
 * @return {@code null} when no leading multiplicative expression is found, that expression
 *     itself when no operator follows it, otherwise an {@code AdditiveExpr} over all operands
 * @throws QuerySyntaxException if an operator is not followed by a multiplicative expression
 */
public static Expr parse(Tokens tokens) {
  int start = tokens.getPosition();
  Expr head = MultiplyExpr.parse(tokens);
  if (head == null) {
    tokens.setPosition(start);
    return null;
  }

  List<Expr> operands = new LinkedList<Expr>();
  operands.add(head);
  // Bit k is set when the k-th operator is '+', clear when it is '-'.
  BitSet additions = new BitSet();

  int operatorIndex = 0;
  while (tokens.hasNext()) {
    char symbol = tokens.nextChar();
    boolean isPlus = symbol == '+';
    if (!isPlus && symbol != '-') {
      // Not ours: hand the character back and stop.
      tokens.pushback();
      break;
    }

    Expr operand = MultiplyExpr.parse(tokens);
    if (operand == null) {
      throw new QuerySyntaxException(tokens);
    }
    operands.add(operand);
    if (isPlus) {
      additions.set(operatorIndex);
    }
    operatorIndex++;
  }

  if (operands.size() == 1) {
    return head;
  }
  return new AdditiveExpr(operands, additions);
}
/**
 * Returns true if the identifier is a regular identifier: it is non-empty, begins with an
 * uppercase ASCII letter, continues with uppercase ASCII letters, digits and underscores only,
 * and is not in the keyword list.
 *
 * @param name the identifier to check; must not be null
 * @return true only when {@code name} satisfies all of the conditions above
 */
static boolean isRegularIdentifier(String name) {
  final int length = name.length();

  // An empty name cannot begin with a letter.
  if (length == 0) {
    return false;
  }

  for (int i = 0; i < length; i++) {
    final char c = name.charAt(i);

    if (c >= 'A' && c <= 'Z') {
      continue;
    }
    // Underscores and digits are allowed anywhere except the first position.
    if (i > 0 && (c == '_' || (c >= '0' && c <= '9'))) {
      continue;
    }
    return false;
  }

  return !Tokens.isKeyword(name);
}
/**
 * Builds a {@code Lexicon} for the given keywords.
 *
 * <p>Each distinct keyword (distinct under the chosen case sensitivity) is mapped to its reserved
 * token value. The word scanner's output is looked up in that keyword map first and in
 * {@code defaultMap} when the keyword map yields {@code null}.
 *
 * @param wordScanner the parser that recognizes words
 * @param keywordNames the keyword names to reserve
 * @param caseSensitive whether keywords are matched case sensitively
 * @param defaultMap the fallback mapping for words that are not keywords
 * @return the lexicon over the keyword map and the mapped word scanner
 */
@SuppressWarnings("unchecked")
static <T> Lexicon lexicon(
    Parser<String> wordScanner,
    String[] keywordNames,
    boolean caseSensitive,
    final Map<String, ?> defaultMap) {
  StringCase stringCase = getStringCase(caseSensitive);

  HashMap<String, Object> reservedByKey = new HashMap<String, Object>();
  for (String keyword : unique(stringCase.comparator(), keywordNames)) {
    Object reserved = Tokens.reserved(keyword);
    reservedByKey.put(stringCase.toKey(keyword), reserved);
  }
  final Map<String, Object> keywordMap = stringCase.toMap(reservedByKey);

  // Keywords win; anything else falls through to the caller-supplied mapping.
  Map<String, Object> tokenizerMap =
      new Map<String, Object>() {
        public Object map(String text) {
          Object keyword = keywordMap.map(text);
          return keyword != null ? keyword : defaultMap.map(text);
        }
      };

  return new Lexicon(keywordMap, wordScanner.map(tokenizerMap));
}
/**
 * Appends {@code token} to {@code tokens}.
 *
 * <p>This implementation accepts every token. It is a hook for subclasses: a scanner subclass can
 * override it to reject tokens it does not want to appear in the scanned expression, which is a
 * convenient way of allowing only a subset of the expression language.
 *
 * @param tokens the token list to append to
 * @param token the token to add
 * @throws XNIException not thrown here; declared so that overriding methods may report a
 *     disallowed token
 */
protected void addToken(final Tokens tokens, final int token) throws XNIException {
  tokens.addToken(token);
}
/**
 * Scans the XPointer expression in {@code data} between {@code currentOffset} and
 * {@code endOffset} and appends the tokens it recognizes to {@code tokens}.
 *
 * <p>Parentheses become open/close paren tokens. A name met while no parenthesis is open becomes
 * a scheme name (token, prefix, local name) when more input follows it, or a shorthand pointer
 * (token, name) when it ends the input. Text met after an open parenthesis, before any close
 * parenthesis and after a name has been seen, is scanned as scheme data.
 *
 * @param symbolTable the table used to intern scanned names and scheme data
 * @param tokens the token list that receives the scanned tokens
 * @param data the expression text
 * @param currentOffset the offset at which scanning starts
 * @param endOffset the offset at which scanning stops (exclusive)
 * @return {@code true} when the whole range was scanned; {@code false} when a name or scheme
 *     data could not be scanned or text appears where neither is expected
 * @throws XNIException declared by {@code addToken}; NOTE(review): presumably also raised by
 *     {@code reportError} - confirm, since the code still returns false after calling it
 */
private boolean scanExpr(
    SymbolTable symbolTable, Tokens tokens, String data, int currentOffset, int endOffset)
    throws XNIException {
  int ch;
  // Parens seen since the last reset; they decide whether text is a name or scheme data.
  int openParen = 0;
  int closeParen = 0;
  int nameOffset, dataOffset;
  // Written below but never read inside this method.
  boolean isQName = false;
  String name = null;
  String prefix = null;
  String schemeData = null;
  StringBuffer schemeDataBuff = new StringBuffer();
  while (true) {
    if (currentOffset == endOffset) {
      break;
    }
    ch = data.charAt(currentOffset);
    // Skip spaces, line feeds, tabs and carriage returns.
    while (ch == ' ' || ch == 0x0A || ch == 0x09 || ch == 0x0D) {
      if (++currentOffset == endOffset) {
        break;
      }
      ch = data.charAt(currentOffset);
    }
    if (currentOffset == endOffset) {
      break;
    }
    //
    // [1] Pointer      ::= Shorthand | SchemeBased
    // [2] Shorthand    ::= NCName
    // [3] SchemeBased  ::= PointerPart (S? PointerPart)*
    // [4] PointerPart  ::= SchemeName '(' SchemeData ')'
    // [5] SchemeName   ::= QName
    // [6] SchemeData   ::= EscapedData*
    // [7] EscapedData  ::= NormalChar | '^(' | '^)' | '^^' | '(' SchemeData ')'
    // [8] NormalChar   ::= UnicodeChar - [()^]
    // [9] UnicodeChar  ::= [#x0-#x10FFFF]
    // [?] QName        ::= (NCName ':')? NCName
    // [?] NCName       ::= (Letter | '_') (NCNameChar)*
    // [?] NCNameChar   ::= Letter | Digit | '.' | '-' | '_' (ascii subset of 'NCNameChar')
    // [?] Letter       ::= [A-Za-z] (ascii subset of 'Letter')
    // [?] Digit        ::= [0-9] (ascii subset of 'Digit')
    //
    // Characters at or above 0x80 share one type; the rest are classified by table lookup.
    byte chartype = (ch >= 0x80) ? CHARTYPE_NONASCII : fASCIICharMap[ch];
    switch (chartype) {
      case CHARTYPE_OPEN_PAREN: // '('
        addToken(tokens, Tokens.XPTRTOKEN_OPEN_PAREN);
        openParen++;
        ++currentOffset;
        break;
      case CHARTYPE_CLOSE_PAREN: // ')'
        addToken(tokens, Tokens.XPTRTOKEN_CLOSE_PAREN);
        closeParen++;
        ++currentOffset;
        break;
      case CHARTYPE_CARRET:
      case CHARTYPE_COLON:
      case CHARTYPE_DIGIT:
      case CHARTYPE_EQUAL:
      case CHARTYPE_LETTER:
      case CHARTYPE_MINUS:
      case CHARTYPE_NONASCII:
      case CHARTYPE_OTHER:
      case CHARTYPE_PERIOD:
      case CHARTYPE_SLASH:
      case CHARTYPE_UNDERSCORE:
      case CHARTYPE_WHITESPACE:
        // Scanning SchemeName | Shorthand: no parenthesis is open, so this must be a name.
        if (openParen == 0) {
          nameOffset = currentOffset;
          currentOffset = scanNCName(data, endOffset, currentOffset);
          // No progress means no NCName starts here.
          if (currentOffset == nameOffset) {
            reportError("InvalidShortHandPointer", new Object[] {data});
            return false;
          }
          // Look at the character after the name; -1 marks end of input.
          if (currentOffset < endOffset) {
            ch = data.charAt(currentOffset);
          } else {
            ch = -1;
          }
          name = symbolTable.addSymbol(data.substring(nameOffset, currentOffset));
          prefix = XMLSymbols.EMPTY_STRING;
          // The name is a QName => a SchemeName: what was scanned so far is the prefix.
          if (ch == ':') {
            // A colon that ends the input, or is not followed by an NCName, fails without a
            // reported error.
            if (++currentOffset == endOffset) {
              return false;
            }
            ch = data.charAt(currentOffset);
            prefix = name;
            nameOffset = currentOffset;
            currentOffset = scanNCName(data, endOffset, currentOffset);
            if (currentOffset == nameOffset) {
              return false;
            }
            if (currentOffset < endOffset) {
              ch = data.charAt(currentOffset);
            } else {
              ch = -1;
            }
            isQName = true;
            name = symbolTable.addSymbol(data.substring(nameOffset, currentOffset));
          }
          // REVISIT:
          // More input after the name => it is a scheme name; otherwise it is a shorthand.
          if (currentOffset != endOffset) {
            addToken(tokens, Tokens.XPTRTOKEN_SCHEMENAME);
            tokens.addToken(prefix);
            tokens.addToken(name);
            isQName = false;
          } else if (currentOffset == endOffset) {
            // NCName => Shorthand
            addToken(tokens, Tokens.XPTRTOKEN_SHORTHAND);
            tokens.addToken(name);
            isQName = false;
          }
          // reset open/close paren for the next pointer part
          closeParen = 0;
          break;
        } else if (openParen > 0 && closeParen == 0 && name != null) {
          // Scanning SchemeData: inside an open parenthesis that follows a name.
          dataOffset = currentOffset;
          currentOffset = scanData(data, schemeDataBuff, endOffset, currentOffset);
          // No progress means no scheme data could be read.
          if (currentOffset == dataOffset) {
            reportError("InvalidSchemeDataInXPointer", new Object[] {data});
            return false;
          }
          if (currentOffset < endOffset) {
            ch = data.charAt(currentOffset);
          } else {
            ch = -1;
          }
          schemeData = symbolTable.addSymbol(schemeDataBuff.toString());
          addToken(tokens, Tokens.XPTRTOKEN_SCHEMEDATA);
          tokens.addToken(schemeData);
          // reset open/close paren for the next pointer part
          openParen = 0;
          // Empty the buffer so the next pointer part starts clean.
          schemeDataBuff.delete(0, schemeDataBuff.length());
        } else {
          // ex. schemeName()
          // Should we throw an exception with a more suitable message instead??
          return false;
        }
    }
  } // end while
  return true;
}
/** * Parses the XPointer framework expression and delegates scheme specific parsing. * * @see * com.sun.org.apache.xerces.internal.xpointer.XPointerProcessor#parseXPointer(java.lang.String) */ public void parseXPointer(String xpointer) throws XNIException { // Initialize init(); // tokens final Tokens tokens = new Tokens(fSymbolTable); // scanner Scanner scanner = new Scanner(fSymbolTable) { protected void addToken(Tokens tokens, int token) throws XNIException { if (token == Tokens.XPTRTOKEN_OPEN_PAREN || token == Tokens.XPTRTOKEN_CLOSE_PAREN || token == Tokens.XPTRTOKEN_SCHEMENAME || token == Tokens.XPTRTOKEN_SCHEMEDATA || token == Tokens.XPTRTOKEN_SHORTHAND) { super.addToken(tokens, token); return; } reportError("InvalidXPointerToken", new Object[] {tokens.getTokenString(token)}); } }; // scan the XPointer expression int length = xpointer.length(); boolean success = scanner.scanExpr(fSymbolTable, tokens, xpointer, 0, length); if (!success) reportError("InvalidXPointerExpression", new Object[] {xpointer}); while (tokens.hasMore()) { int token = tokens.nextToken(); switch (token) { case Tokens.XPTRTOKEN_SHORTHAND: { // The shortHand name token = tokens.nextToken(); String shortHandPointerName = tokens.getTokenString(token); if (shortHandPointerName == null) { reportError("InvalidXPointerExpression", new Object[] {xpointer}); } XPointerPart shortHandPointer = new ShortHandPointer(fSymbolTable); shortHandPointer.setSchemeName(shortHandPointerName); fXPointerParts.add(shortHandPointer); break; } case Tokens.XPTRTOKEN_SCHEMENAME: { // Retreive the local name and prefix to form the scheme name token = tokens.nextToken(); String prefix = tokens.getTokenString(token); token = tokens.nextToken(); String localName = tokens.getTokenString(token); String schemeName = prefix + localName; // The next character should be an open parenthesis int openParenCount = 0; int closeParenCount = 0; token = tokens.nextToken(); String openParen = tokens.getTokenString(token); if (openParen 
!= "XPTRTOKEN_OPEN_PAREN") { // can not have more than one ShortHand Pointer if (token == Tokens.XPTRTOKEN_SHORTHAND) { reportError("MultipleShortHandPointers", new Object[] {xpointer}); } else { reportError("InvalidXPointerExpression", new Object[] {xpointer}); } } openParenCount++; // followed by zero or more ( and the schemeData String schemeData = null; while (tokens.hasMore()) { token = tokens.nextToken(); schemeData = tokens.getTokenString(token); if (schemeData != "XPTRTOKEN_OPEN_PAREN") { break; } openParenCount++; } token = tokens.nextToken(); schemeData = tokens.getTokenString(token); // followed by the same number of ) token = tokens.nextToken(); String closeParen = tokens.getTokenString(token); if (closeParen != "XPTRTOKEN_CLOSE_PAREN") { reportError("SchemeDataNotFollowedByCloseParenthesis", new Object[] {xpointer}); } closeParenCount++; while (tokens.hasMore()) { if (tokens.getTokenString(tokens.peekToken()) != "XPTRTOKEN_OPEN_PAREN") { break; } closeParenCount++; } // check if the number of open parenthesis are equal to the number of close parenthesis if (openParenCount != closeParenCount) { reportError( "UnbalancedParenthesisInXPointerExpression", new Object[] { xpointer, new Integer(openParenCount), new Integer(closeParenCount) }); } // Perform scheme specific parsing of the pointer part if (schemeName.equals(ELEMENT_SCHEME_NAME)) { XPointerPart elementSchemePointer = new ElementSchemePointer(fSymbolTable, fErrorReporter); elementSchemePointer.setSchemeName(schemeName); elementSchemePointer.setSchemeData(schemeData); // If an exception occurs while parsing the element() scheme expression // ignore it and move on to the next pointer part try { elementSchemePointer.parseXPointer(schemeData); fXPointerParts.add(elementSchemePointer); } catch (XNIException e) { // Re-throw the XPointer element() scheme syntax error. throw new XNIException(e); } } else { // ???? 
reportWarning("SchemeUnsupported", new Object[] {schemeName}); } break; } default: reportError("InvalidXPointerExpression", new Object[] {xpointer}); } } }
/**
 * {@inheritDoc}
 *
 * <p>Reads the input one character at a time and adds tokens for symbols, the decimal point,
 * single- and double-quoted strings, comments and words. Whitespace produces no token. Comment
 * tokens ({@code --} and {@code //} end-of-line comments and slash-star comments) are only added
 * when {@code useComments} is set; their characters are consumed either way.
 *
 * @param input the character stream to read
 * @param tokens the collector that receives the tokens
 * @throws ParsingException if a single- or double-quoted string has no closing quote
 * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
 */
@Override
public void tokenize(CharacterStream input, Tokens tokens) throws ParsingException {
  // Start (inclusive) and end (exclusive) indexes of the token being built.
  int startIndex;
  int endIndex;
  while (input.hasNext()) {
    char c = input.next();
    switch (c) {
      case ' ':
      case '\t':
      case '\n':
      case '\r':
        // Just skip these whitespace characters ...
        break;

      // ==============================================================================================
      // DDL Comments token = "--"
      // ==============================================================================================
      case '-':
        {
          startIndex = input.index();
          Position startPosition = input.position(startIndex);
          if (input.isNext('-')) {
            // -- END OF LINE comment ... read up to and including the first line terminator
            boolean foundLineTerminator = false;
            while (input.hasNext()) {
              c = input.next();
              if (c == '\n' || c == '\r') {
                foundLineTerminator = true;
                break;
              }
            }
            endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
            if (!foundLineTerminator) ++endIndex; // must point beyond last char
            // Treat "\r\n" as one terminator by also consuming the '\n'.
            if (c == '\r' && input.isNext('\n')) input.next();

            // Check for PARSER_ID
            // The comment is only recorded when comments are requested.
            if (useComments) {
              tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
            }

          } else {
            // just a regular dash ...
            tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
          }
          break;
        }
      // ==============================================================================================
      // Single-character symbols
      case '(':
      case ')':
      case '{':
      case '}':
      case '*':
      case ',':
      case ';':
      case '+':
      case '%':
      case '?':
      case '[':
      case ']':
      case '!':
      case '<':
      case '>':
      case '|':
      case '=':
      case ':':
        tokens.addToken(
            input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
        break;
      case '.':
        // A lone dot gets its own token type.
        tokens.addToken(
            input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
        break;
      case '\"':
        // Double-quoted string: read to the closing quote.
        startIndex = input.index();
        Position startingPosition = input.position(startIndex);
        boolean foundClosingQuote = false;
        while (input.hasNext()) {
          c = input.next();
          if ((c == '\\' || c == '"') && input.isNext('"')) {
            // A backslash or a doubled quote escapes the quote that follows it.
            c = input.next(); // consume the " character since it is escaped
          } else if (c == '"') {
            foundClosingQuote = true;
            break;
          }
        }
        if (!foundClosingQuote) {
          String msg =
              CommonI18n.noMatchingDoubleQuoteFound.text(
                  startingPosition.getLine(), startingPosition.getColumn());
          throw new ParsingException(startingPosition, msg);
        }
        endIndex = input.index() + 1; // beyond last character read
        tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
        break;
      case '\u2019': // the right single quotation mark is treated like an apostrophe
      case '\'':
        // Single-quoted string: the closing quote must be the same character that opened it.
        char quoteChar = c;
        startIndex = input.index();
        startingPosition = input.position(startIndex);
        foundClosingQuote = false;
        while (input.hasNext()) {
          c = input.next();
          if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
            // A backslash or a doubled quote escapes the quote that follows it.
            c = input.next(); // consume the quote character since it is escaped
          } else if (c == quoteChar) {
            foundClosingQuote = true;
            break;
          }
        }
        if (!foundClosingQuote) {
          String msg =
              CommonI18n.noMatchingSingleQuoteFound.text(
                  startingPosition.getLine(), startingPosition.getColumn());
          throw new ParsingException(startingPosition, msg);
        }
        endIndex = input.index() + 1; // beyond last character read
        tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
        break;
      case '/':
        startIndex = input.index();
        startingPosition = input.position(startIndex);
        if (input.isNext('/')) {
          // End-of-line comment ... handled exactly like the "--" comment above
          boolean foundLineTerminator = false;
          while (input.hasNext()) {
            c = input.next();
            if (c == '\n' || c == '\r') {
              foundLineTerminator = true;
              break;
            }
          }
          endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
          if (!foundLineTerminator) ++endIndex; // must point beyond last char
          // Treat "\r\n" as one terminator by also consuming the '\n'.
          if (c == '\r' && input.isNext('\n')) input.next();
          if (useComments) {
            tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
          }

        } else if (input.isNext('*')) {
          // Multi-line comment ... read until the next two characters are '*' and '/'
          while (input.hasNext() && !input.isNext('*', '/')) {
            c = input.next();
          }
          // If the input ran out first, these two reads are skipped.
          if (input.hasNext()) input.next(); // consume the '*'
          if (input.hasNext()) input.next(); // consume the '/'

          endIndex = input.index() + 1; // the token will include the '/' and '*' characters
          if (useComments) {
            tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
          }

        } else {
          // just a regular slash ...
          tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
        }
        break;
      default:
        // Anything else starts a word.
        startIndex = input.index();
        Position startPosition = input.position(startIndex);
        // Read until another whitespace/symbol/decimal/slash is found
        while (input.hasNext()
            && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:"))) {
          c = input.next();
        }
        endIndex = input.index() + 1; // beyond last character that was included
        tokens.addToken(startPosition, startIndex, endIndex, WORD);
    }
  }
}