Ejemplo n.º 1
0
  /**
   * Splits {@code text} into sentences, each sentence being a list of tokens.
   *
   * <p>Not thread safe: the sentence under construction is held in {@code currentSentence}, which
   * is reassigned while the text is processed.
   *
   * @param text the text to tokenize
   * @return the sentences found in {@code text}, in order of appearance
   */
  public List<List<Token>> process(final String text) {
    final List<List<Token>> sentences = new ArrayList<>();
    currentSentence = new ArrayList<>();

    for (final Tokens remaining = splitText(text); remaining.hasNext(); ) {
      final Token token = remaining.next();
      final String word = token.text.trim();

      // whitespace-only tokens never contribute to a sentence
      if (word.isEmpty()) {
        continue;
      }

      // WITHOUT_PUNCTUATION only keeps tokens whose initChar is a letter or digit
      final boolean accepted =
          mode == WITH_PUNCTUATION
              || (mode == WITHOUT_PUNCTUATION && isLetterOrDigit(initChar(token.text)));
      if (!accepted) {
        continue;
      }

      boolean mayEndSentence = true;
      if (token.text.contains("'")) {
        wordContainsApostrophe(token);
      } else if (".".equals(word)) {
        mayEndSentence = wordIsFullStop(token);
      } else if (":".equals(word)) {
        wordIsColon(remaining, token);
      } else {
        currentSentence.add(token);
      }

      // a terminator closes the current sentence and starts a fresh one
      if (mayEndSentence && equalss(word, ".", ";", "?", "!")) {
        sentences.add(currentSentence);
        currentSentence = new ArrayList<>();
      }
    }

    // keep a trailing sentence that was not closed by a terminator
    if (!currentSentence.isEmpty()) {
      sentences.add(currentSentence);
    }
    return sentences;
  }
Ejemplo n.º 2
0
 /**
  * Handles a ":" token. When the current sentence already has a word, another token follows and
  * neither neighbour touches the colon with a space character, the previous sentence word, the
  * colon and the next token are handed to {@code mergeWordsIntoSentence}; otherwise the colon is
  * appended to the current sentence on its own.
  */
 private void wordIsColon(final Tokens tokens, final Token t) {
   // evaluated left to right, so the peeks only happen when both neighbours exist
   final boolean mergeable =
       !currentSentence.isEmpty()
           && tokens.hasNext()
           && !isSpaceChar(lastChar(tokens.peekPrev().text))
           && !isSpaceChar(initChar(tokens.peekNext().text));

   if (!mergeable) {
     currentSentence.add(t);
     return;
   }

   // try to merge the 3 tokens back together again
   final int lastIndex = currentSentence.size() - 1;
   mergeWordsIntoSentence(currentSentence.get(lastIndex), t, tokens.next(), lastIndex);
 }
  /**
   * Tokenizes the code held by {@code sourceCode} and passes every token that is not being
   * discarded to {@code processToken}; finally appends the EOF entry to {@code tokenEntries}.
   *
   * @param sourceCode the source whose code buffer is tokenized
   * @param tokenEntries receives the produced token entries
   */
  public void tokenize(SourceCode sourceCode, Tokens tokenEntries) {
    StringBuilder code = sourceCode.getCodeBuffer();

    // Note that Java version is irrelevant for tokenizing
    LanguageVersionHandler handler = LanguageVersion.JAVA_14.getLanguageVersionHandler();
    String fileName = sourceCode.getFileName();
    TokenManager tokenManager =
        handler
            .getParser(handler.getDefaultParserOptions())
            .getTokenManager(fileName, new StringReader(code.toString()));

    Token token = (Token) tokenManager.getNextToken();
    TokenDiscarder discarder = new TokenDiscarder(ignoreAnnotations);

    // the loop stops at the first token whose image is empty
    for (; token.image.length() > 0; token = (Token) tokenManager.getNextToken()) {
      discarder.updateState(token);
      if (!discarder.isDiscarding()) {
        processToken(tokenEntries, fileName, token);
      }
    }

    tokenEntries.add(TokenEntry.getEOF());
  }
Ejemplo n.º 4
0
 /**
  * Creates the ability (passing -5 to the superclass) whose single effect destroys every
  * permanent that is not this ability's source, a land, or a token.
  */
 public ElspethTirelAbility2(GameState state) {
   super(state, -5, "Destroy all other permanents except for lands and tokens.");

   SetGenerator everything = Permanents.instance();
   SetGenerator spared =
       Union.instance(ABILITY_SOURCE_OF_THIS, LandPermanents.instance(), Tokens.instance());
   SetGenerator doomed = RelativeComplement.instance(everything, spared);

   this.addEffect(destroy(doomed, "Destroy all other permanents except for lands and tokens."));
 }
Ejemplo n.º 5
0
  /** Checks that a three-sentence text is split into three token lists with the expected words. */
  @Test
  public void sentencesTokensTest() {
    Preparator preparator = new Preparator();
    String text = "This is a sentence. And another one. Just for the kicks.";
    PreparedInput prepared =
        preparator.prepare("temp", text, new StanfordManualPreparationSettings());

    List<List<Token>> sentences = prepared.getTokens().getSentenceTokens();
    assertEquals(3, sentences.size());

    List<Token> first = sentences.get(0);
    assertEquals("This", first.get(0).getOriginal());
    assertEquals("sentence", first.get(3).getOriginal());
    assertEquals(".", first.get(4).getOriginal());

    List<Token> second = sentences.get(1);
    assertEquals("another", second.get(1).getOriginal());

    List<Token> third = sentences.get(2);
    assertEquals("kicks", third.get(3).getOriginal());
  }
Ejemplo n.º 6
0
  /**
   * Reads past the current token when its type equals {@code tokenId}; otherwise throws the
   * exception produced by {@code unexpectedTokenRequire} for the keyword of the required token.
   */
  void readThis(int tokenId) {

    if (token.tokenType == tokenId) {
      read();

      return;
    }

    throw unexpectedTokenRequire(Tokens.getKeyword(tokenId));
  }
 /**
  * Adds an entry for {@code currentToken} to {@code tokenEntries}.
  *
  * <p>The entry normally carries the token's image. When {@code ignoreLiterals} is set and the
  * token is a string, character, decimal or floating point literal, or when
  * {@code ignoreIdentifiers} is set and the token is an identifier, the image is replaced by the
  * textual form of the token kind.
  */
 private void processToken(Tokens tokenEntries, String fileName, Token currentToken) {
   final boolean literalKind =
       currentToken.kind == JavaParserConstants.STRING_LITERAL
           || currentToken.kind == JavaParserConstants.CHARACTER_LITERAL
           || currentToken.kind == JavaParserConstants.DECIMAL_LITERAL
           || currentToken.kind == JavaParserConstants.FLOATING_POINT_LITERAL;
   final boolean maskedLiteral = ignoreLiterals && literalKind;
   final boolean maskedIdentifier =
       ignoreIdentifiers && currentToken.kind == JavaParserConstants.IDENTIFIER;

   final String image =
       (maskedLiteral || maskedIdentifier)
           ? String.valueOf(currentToken.kind)
           : currentToken.image;
   tokenEntries.add(new TokenEntry(image, fileName, currentToken.beginLine));
 }
Ejemplo n.º 8
0
  /**
   * Parses a chain of multiplicative expressions joined by '+' or '-'.
   *
   * <p>Returns {@code null}, with the token position restored, when no leading multiplicative
   * expression can be parsed. A single operand is returned as-is; several operands are wrapped in
   * an {@code AdditiveExpr} together with a bit set in which bit {@code i} is set when the
   * operator following operand {@code i} is '+'.
   *
   * @param tokens the token source to read from
   * @return the parsed expression, or {@code null} if none starts at the current position
   * @throws QuerySyntaxException if an operator is not followed by a multiplicative expression
   */
  public static Expr parse(Tokens tokens) {
    final int startPosition = tokens.getPosition();
    final Expr head = MultiplyExpr.parse(tokens);

    if (head == null) {
      tokens.setPosition(startPosition);
      return null;
    }

    final List<Expr> operands = new LinkedList<Expr>();
    operands.add(head);

    final BitSet plusOperators = new BitSet();
    int operatorSlot = 0;

    while (tokens.hasNext()) {
      final char symbol = tokens.nextChar();

      // anything other than '+' or '-' ends the chain and is handed back to the token source
      if (symbol != '+' && symbol != '-') {
        tokens.pushback();
        break;
      }

      final Expr operand = MultiplyExpr.parse(tokens);
      if (operand == null) {
        throw new QuerySyntaxException(tokens);
      }
      operands.add(operand);

      if (symbol == '+') {
        plusOperators.set(operatorSlot);
      }
      operatorSlot++;
    }

    if (operands.size() == 1) {
      return head;
    }
    return new AdditiveExpr(operands, plusOperators);
  }
    /**
     * Returns true if the identifier consists only of uppercase ASCII letters, digits and
     * underscores, begins with a letter, and is not in the keyword list.
     *
     * <p>Neither a digit nor an underscore is accepted in the first position, so names such as
     * {@code "1ABC"} or {@code "_ABC"} are not regular identifiers.
     *
     * @param name the identifier to check
     * @return true if {@code name} matches the rules above and {@code Tokens.isKeyword(name)} is
     *     false
     */
    static boolean isRegularIdentifier(String name) {

      for (int i = 0, length = name.length(); i < length; i++) {
        int c = name.charAt(i);

        if (c >= 'A' && c <= 'Z') {
          continue;
        }

        // digits and underscores are only legal after the leading letter
        if (i > 0 && (c == '_' || (c >= '0' && c <= '9'))) {
          continue;
        }

        return false;
      }

      return !Tokens.isKeyword(name);
    }
Ejemplo n.º 10
0
 /**
  * Builds a {@code Lexicon} for the given keywords.
  *
  * <p>Each unique keyword name is mapped, under its case-normalized key, to the value returned by
  * {@code Tokens.reserved}. Words produced by {@code wordScanner} are looked up in that keyword
  * map first and, when it yields {@code null}, in {@code defaultMap}.
  *
  * @param wordScanner the parser that recognizes a word
  * @param keywordNames the keyword names
  * @param caseSensitive whether keyword lookup is case sensitive
  * @param defaultMap the fallback mapping for words that are not keywords
  */
 @SuppressWarnings("unchecked")
 static <T> Lexicon lexicon(
     Parser<String> wordScanner,
     String[] keywordNames,
     boolean caseSensitive,
     final Map<String, ?> defaultMap) {
   final StringCase stringCase = getStringCase(caseSensitive);
   final HashMap<String, Object> reserved = new HashMap<String, Object>();
   for (String keyword : unique(stringCase.comparator(), keywordNames)) {
     final Object reservedToken = Tokens.reserved(keyword);
     reserved.put(stringCase.toKey(keyword), reservedToken);
   }

   final Map<String, Object> keywordMap = stringCase.toMap(reserved);
   final Map<String, Object> wordMap =
       new Map<String, Object>() {
         public Object map(String text) {
           // keywords win; everything else falls through to the caller-supplied map
           final Object keywordToken = keywordMap.map(text);
           return keywordToken != null ? keywordToken : defaultMap.map(text);
         }
       };
   return new Lexicon(keywordMap, wordScanner.map(wordMap));
 }
Ejemplo n.º 11
0
 /**
  * Adds the specified token to the token list by delegating to {@code tokens.addToken(token)}.
  * By default, this method allows all tokens. However, subclasses of the scanner can override
  * this method in order to disallow certain tokens from being used in the scanned expression.
  * This is a convenient way of allowing only a subset of the token vocabulary.
  *
  * @param tokens the token list to append to
  * @param token the token to add
  * @throws XNIException declared so that overriding implementations can reject a token; it is
  *     not thrown directly by this implementation
  */
 protected void addToken(Tokens tokens, int token) throws XNIException {
   tokens.addToken(token);
 } // addToken(Tokens, int)
Ejemplo n.º 12
0
    /**
     * Scans the XPointer expression in {@code data} between {@code currentOffset} and {@code
     * endOffset} and appends the recognized tokens to {@code tokens}.
     *
     * <p>Parenthesis tokens are emitted through {@link #addToken(Tokens, int)}. A name scanned
     * outside parentheses is emitted as a scheme name (followed by prefix and local name) when
     * more input follows it, or as a shorthand pointer when it ends the input. Text scanned inside
     * an open parenthesis is emitted as scheme data.
     *
     * @param symbolTable used to canonicalize scanned names and scheme data
     * @param tokens the token list that receives the scanned tokens
     * @param data the expression text
     * @param currentOffset offset of the first character to scan
     * @param endOffset offset just past the last character to scan
     * @return {@code true} when the end offset is reached without a failure, {@code false} when
     *     scanning is aborted (some, but not all, abort paths call {@code reportError} first)
     * @throws XNIException declared for errors raised by {@code addToken} or {@code reportError}
     */
    private boolean scanExpr(
        SymbolTable symbolTable, Tokens tokens, String data, int currentOffset, int endOffset)
        throws XNIException {

      int ch;
      int openParen = 0;
      int closeParen = 0;
      int nameOffset, dataOffset;
      // NOTE(review): isQName is assigned below but never read inside this method
      boolean isQName = false;
      String name = null;
      String prefix = null;
      String schemeData = null;
      StringBuffer schemeDataBuff = new StringBuffer();

      while (true) {

        if (currentOffset == endOffset) {
          break;
        }
        ch = data.charAt(currentOffset);

        // Skip whitespace (space, LF, tab, CR) in front of the next token
        while (ch == ' ' || ch == 0x0A || ch == 0x09 || ch == 0x0D) {
          if (++currentOffset == endOffset) {
            break;
          }
          ch = data.charAt(currentOffset);
        }
        if (currentOffset == endOffset) {
          break;
        }

        //
        // [1]    Pointer      ::=    Shorthand | SchemeBased
        // [2]    Shorthand    ::=    NCName
        // [3]    SchemeBased  ::=    PointerPart (S? PointerPart)*
        // [4]    PointerPart  ::=    SchemeName '(' SchemeData ')'
        // [5]    SchemeName   ::=    QName
        // [6]    SchemeData   ::=    EscapedData*
        // [7]    EscapedData  ::=    NormalChar | '^(' | '^)' | '^^' | '(' SchemeData ')'
        // [8]    NormalChar   ::=    UnicodeChar - [()^]
        // [9]    UnicodeChar  ::=    [#x0-#x10FFFF]
        // [?]    QName        ::=    (NCName ':')? NCName
        // [?]    NCName       ::=    (Letter | '_') (NCNameChar)*
        // [?]    NCNameChar   ::=    Letter | Digit | '.' | '-' | '_'  (ascii subset of
        // 'NCNameChar')
        // [?]    Letter       ::=    [A-Za-z]                              (ascii subset of
        // 'Letter')
        // [?]    Digit        ::=    [0-9]                                  (ascii subset of
        // 'Digit')
        //
        // Classify the character: everything at or above 0x80 is non-ASCII, the rest is looked
        // up in the ASCII table.
        byte chartype = (ch >= 0x80) ? CHARTYPE_NONASCII : fASCIICharMap[ch];

        // NOTE(review): the switch has no default branch; a chartype not listed below would leave
        // currentOffset unchanged for the next iteration - confirm every table value is covered.
        switch (chartype) {
          case CHARTYPE_OPEN_PAREN: // '('
            addToken(tokens, Tokens.XPTRTOKEN_OPEN_PAREN);
            openParen++;
            ++currentOffset;
            break;

          case CHARTYPE_CLOSE_PAREN: // ')'
            addToken(tokens, Tokens.XPTRTOKEN_CLOSE_PAREN);
            closeParen++;
            ++currentOffset;
            break;

          case CHARTYPE_CARRET:
          case CHARTYPE_COLON:
          case CHARTYPE_DIGIT:
          case CHARTYPE_EQUAL:
          case CHARTYPE_LETTER:
          case CHARTYPE_MINUS:
          case CHARTYPE_NONASCII:
          case CHARTYPE_OTHER:
          case CHARTYPE_PERIOD:
          case CHARTYPE_SLASH:
          case CHARTYPE_UNDERSCORE:
          case CHARTYPE_WHITESPACE:
            // Scanning SchemeName | Shorthand
            if (openParen == 0) {
              nameOffset = currentOffset;
              currentOffset = scanNCName(data, endOffset, currentOffset);

              // scanNCName did not advance: no name starts here
              if (currentOffset == nameOffset) {
                reportError("InvalidShortHandPointer", new Object[] {data});
                return false;
              }

              // look at the character after the name; -1 stands for "end of input"
              if (currentOffset < endOffset) {
                ch = data.charAt(currentOffset);
              } else {
                ch = -1;
              }

              name = symbolTable.addSymbol(data.substring(nameOffset, currentOffset));
              prefix = XMLSymbols.EMPTY_STRING;

              // The name is a QName => a SchemeName
              if (ch == ':') {
                // NOTE(review): a trailing ':' aborts without calling reportError
                if (++currentOffset == endOffset) {
                  return false;
                }

                ch = data.charAt(currentOffset);
                // what was scanned so far is the prefix; scan the local part next
                prefix = name;
                nameOffset = currentOffset;
                currentOffset = scanNCName(data, endOffset, currentOffset);

                // NOTE(review): a missing local part also aborts without calling reportError
                if (currentOffset == nameOffset) {
                  return false;
                }

                if (currentOffset < endOffset) {
                  ch = data.charAt(currentOffset);
                } else {
                  ch = -1;
                }

                isQName = true;
                name = symbolTable.addSymbol(data.substring(nameOffset, currentOffset));
              }

              // REVISIT:
              // A name followed by more input is emitted as a scheme name (prefix, then local
              // name); a name that ends the input is emitted as a shorthand pointer.
              if (currentOffset != endOffset) {
                addToken(tokens, Tokens.XPTRTOKEN_SCHEMENAME);
                tokens.addToken(prefix);
                tokens.addToken(name);
                isQName = false;
              } else if (currentOffset == endOffset) {
                // NCName => Shorthand
                addToken(tokens, Tokens.XPTRTOKEN_SHORTHAND);
                tokens.addToken(name);
                isQName = false;
              }

              // reset open/close paren for the next pointer part
              closeParen = 0;

              break;

            } else if (openParen > 0 && closeParen == 0 && name != null) {
              // Scanning SchemeData
              dataOffset = currentOffset;
              currentOffset = scanData(data, schemeDataBuff, endOffset, currentOffset);

              // scanData did not advance: no scheme data could be read
              if (currentOffset == dataOffset) {
                reportError("InvalidSchemeDataInXPointer", new Object[] {data});
                return false;
              }

              if (currentOffset < endOffset) {
                ch = data.charAt(currentOffset);
              } else {
                ch = -1;
              }

              schemeData = symbolTable.addSymbol(schemeDataBuff.toString());
              addToken(tokens, Tokens.XPTRTOKEN_SCHEMEDATA);
              tokens.addToken(schemeData);

              // reset open/close paren for the next pointer part
              openParen = 0;
              schemeDataBuff.delete(0, schemeDataBuff.length());

            } else {
              // ex. schemeName()
              // Should we throw an exception with a more suitable message instead??
              return false;
            }
        }
      } // end while
      return true;
    }
Ejemplo n.º 13
0
  /**
   * Parses the XPointer framework expression and delegates scheme specific parsing.
   *
   * <p>The expression is first tokenized by a scanner that only admits the XPointer framework
   * tokens. The token stream is then walked: a shorthand pointer becomes a {@code
   * ShortHandPointer} part, an {@code element()} pointer part becomes an {@code
   * ElementSchemePointer} part that parses its own scheme data, and any other scheme only
   * produces a "SchemeUnsupported" warning.
   *
   * @param xpointer the XPointer expression to parse
   * @throws XNIException declared for errors raised while scanning or walking the tokens; also
   *     thrown, wrapping the cause, when the element() scheme data cannot be parsed
   * @see
   *     com.sun.org.apache.xerces.internal.xpointer.XPointerProcessor#parseXPointer(java.lang.String)
   */
  public void parseXPointer(String xpointer) throws XNIException {

    // Initialize
    init();

    // tokens
    final Tokens tokens = new Tokens(fSymbolTable);

    // scanner that rejects every token outside the XPointer framework vocabulary
    Scanner scanner =
        new Scanner(fSymbolTable) {
          @Override
          protected void addToken(Tokens tokens, int token) throws XNIException {
            if (token == Tokens.XPTRTOKEN_OPEN_PAREN
                || token == Tokens.XPTRTOKEN_CLOSE_PAREN
                || token == Tokens.XPTRTOKEN_SCHEMENAME
                || token == Tokens.XPTRTOKEN_SCHEMEDATA
                || token == Tokens.XPTRTOKEN_SHORTHAND) {
              super.addToken(tokens, token);
              return;
            }
            reportError("InvalidXPointerToken", new Object[] {tokens.getTokenString(token)});
          }
        };

    // scan the XPointer expression
    int length = xpointer.length();
    boolean success = scanner.scanExpr(fSymbolTable, tokens, xpointer, 0, length);

    if (!success) {
      reportError("InvalidXPointerExpression", new Object[] {xpointer});
    }

    while (tokens.hasMore()) {
      int token = tokens.nextToken();

      switch (token) {
        case Tokens.XPTRTOKEN_SHORTHAND:
          {

            // The shortHand name
            token = tokens.nextToken();
            String shortHandPointerName = tokens.getTokenString(token);

            if (shortHandPointerName == null) {
              reportError("InvalidXPointerExpression", new Object[] {xpointer});
            }

            XPointerPart shortHandPointer = new ShortHandPointer(fSymbolTable);
            shortHandPointer.setSchemeName(shortHandPointerName);
            fXPointerParts.add(shortHandPointer);
            break;
          }
        case Tokens.XPTRTOKEN_SCHEMENAME:
          {

            // Retrieve the local name and prefix to form the scheme name
            token = tokens.nextToken();
            String prefix = tokens.getTokenString(token);
            token = tokens.nextToken();
            String localName = tokens.getTokenString(token);

            String schemeName = prefix + localName;

            // The next character should be an open parenthesis
            int openParenCount = 0;
            int closeParenCount = 0;

            token = tokens.nextToken();
            String openParen = tokens.getTokenString(token);
            // Token names are compared by value; reference comparison would only work while
            // every name happens to be the same interned String instance.
            if (!"XPTRTOKEN_OPEN_PAREN".equals(openParen)) {

              // can not have more than one ShortHand Pointer
              if (token == Tokens.XPTRTOKEN_SHORTHAND) {
                reportError("MultipleShortHandPointers", new Object[] {xpointer});
              } else {
                reportError("InvalidXPointerExpression", new Object[] {xpointer});
              }
            }
            openParenCount++;

            // followed by zero or more ( and  the schemeData
            String schemeData = null;
            while (tokens.hasMore()) {
              token = tokens.nextToken();
              schemeData = tokens.getTokenString(token);
              if (!"XPTRTOKEN_OPEN_PAREN".equals(schemeData)) {
                break;
              }
              openParenCount++;
            }
            // the token read above was the scheme-data marker; the data itself comes next
            token = tokens.nextToken();
            schemeData = tokens.getTokenString(token);

            // followed by the same number of )
            token = tokens.nextToken();
            String closeParen = tokens.getTokenString(token);
            if (!"XPTRTOKEN_CLOSE_PAREN".equals(closeParen)) {
              reportError("SchemeDataNotFollowedByCloseParenthesis", new Object[] {xpointer});
            }
            closeParenCount++;

            // Count and consume any further ')' tokens. Each one must be read from the stream:
            // only peeking never advances, which made this loop spin forever when the peeked
            // token matched, and it previously tested for '(' instead of ')'.
            while (tokens.hasMore()
                && "XPTRTOKEN_CLOSE_PAREN".equals(tokens.getTokenString(tokens.peekToken()))) {
              tokens.nextToken();
              closeParenCount++;
            }

            // check if the number of open parenthesis are equal to the number of close parenthesis
            if (openParenCount != closeParenCount) {
              reportError(
                  "UnbalancedParenthesisInXPointerExpression",
                  new Object[] {
                    xpointer, Integer.valueOf(openParenCount), Integer.valueOf(closeParenCount)
                  });
            }

            // Perform scheme specific parsing of the pointer part
            if (schemeName.equals(ELEMENT_SCHEME_NAME)) {
              XPointerPart elementSchemePointer =
                  new ElementSchemePointer(fSymbolTable, fErrorReporter);
              elementSchemePointer.setSchemeName(schemeName);
              elementSchemePointer.setSchemeData(schemeData);

              // A syntax error in the element() scheme expression is not ignored:
              // it is wrapped and re-thrown to the caller.
              try {
                elementSchemePointer.parseXPointer(schemeData);
                fXPointerParts.add(elementSchemePointer);
              } catch (XNIException e) {
                // Re-throw the XPointer element() scheme syntax error.
                throw new XNIException(e);
              }

            } else {
              // Schemes other than element() are not processed; only a warning is reported
              reportWarning("SchemeUnsupported", new Object[] {schemeName});
            }

            break;
          }
        default:
          reportError("InvalidXPointerExpression", new Object[] {xpointer});
      }
    }
  }
    /**
     * {@inheritDoc}
     *
     * <p>Reads {@code input} character by character and adds tokens to {@code tokens}:
     * whitespace is skipped; {@code --}, {@code //} and {@code /* ... *}{@code /} comments become
     * COMMENT tokens (only when {@code useComments} is set); the listed punctuation characters
     * become one-character SYMBOL tokens; {@code '.'} becomes a DECIMAL token; quoted text becomes
     * a DOUBLE_QUOTED_STRING or SINGLE_QUOTED_STRING token; anything else becomes a WORD token.
     *
     * @param input the character stream to read
     * @param tokens receives the tokens in the order they are found
     * @throws ParsingException if a quoted string has no closing quote
     * @see org.modeshape.common.text.TokenStream.Tokenizer#tokenize(CharacterStream, Tokens)
     */
    @Override
    public void tokenize(CharacterStream input, Tokens tokens) throws ParsingException {
      int startIndex;
      int endIndex;
      while (input.hasNext()) {
        char c = input.next();
        switch (c) {
          case ' ':
          case '\t':
          case '\n':
          case '\r':
            // Just skip these whitespace characters ...
            break;
            // ==============================================================================================
            // DDL Comments token = "--"
            // ==============================================================================================
          case '-':
            {
              startIndex = input.index();
              Position startPosition = input.position(startIndex);
              if (input.isNext('-')) {
                // -- END OF LINE comment ...
                // consume everything up to and including the first line terminator
                boolean foundLineTerminator = false;
                while (input.hasNext()) {
                  c = input.next();
                  if (c == '\n' || c == '\r') {
                    foundLineTerminator = true;
                    break;
                  }
                }
                endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
                if (!foundLineTerminator) ++endIndex; // must point beyond last char
                // treat "\r\n" as a single line terminator
                if (c == '\r' && input.isNext('\n')) input.next();

                // Check for PARSER_ID
                // NOTE(review): no PARSER_ID check is performed here - stale comment?

                if (useComments) {
                  tokens.addToken(startPosition, startIndex, endIndex, COMMENT);
                }

              } else {
                // just a regular dash ...
                tokens.addToken(startPosition, startIndex, startIndex + 1, SYMBOL);
              }
              break;
            }
            // ==============================================================================================
            // Single-character symbols
          case '(':
          case ')':
          case '{':
          case '}':
          case '*':
          case ',':
          case ';':
          case '+':
          case '%':
          case '?':
          case '[':
          case ']':
          case '!':
          case '<':
          case '>':
          case '|':
          case '=':
          case ':':
            tokens.addToken(
                input.position(input.index()), input.index(), input.index() + 1, SYMBOL);
            break;
          case '.':
            tokens.addToken(
                input.position(input.index()), input.index(), input.index() + 1, DECIMAL);
            break;
          case '\"':
            // Double-quoted string: both \" and "" count as an escaped quote inside it
            startIndex = input.index();
            Position startingPosition = input.position(startIndex);
            boolean foundClosingQuote = false;
            while (input.hasNext()) {
              c = input.next();
              if ((c == '\\' || c == '"') && input.isNext('"')) {
                c = input.next(); // consume the " character since it is escaped
              } else if (c == '"') {
                foundClosingQuote = true;
                break;
              }
            }
            if (!foundClosingQuote) {
              String msg =
                  CommonI18n.noMatchingDoubleQuoteFound.text(
                      startingPosition.getLine(), startingPosition.getColumn());
              throw new ParsingException(startingPosition, msg);
            }
            endIndex = input.index() + 1; // beyond last character read
            tokens.addToken(startingPosition, startIndex, endIndex, DOUBLE_QUOTED_STRING);
            break;
          case '\u2019': // '’':
          case '\'':
            // Single-quoted string: must be closed by the same quote character that opened it;
            // a backslash or a doubled quote escapes the quote character
            char quoteChar = c;
            startIndex = input.index();
            startingPosition = input.position(startIndex);
            foundClosingQuote = false;
            while (input.hasNext()) {
              c = input.next();
              if ((c == '\\' || c == quoteChar) && input.isNext(quoteChar)) {
                c = input.next(); // consume the quote character since it is escaped
              } else if (c == quoteChar) {
                foundClosingQuote = true;
                break;
              }
            }
            if (!foundClosingQuote) {
              String msg =
                  CommonI18n.noMatchingSingleQuoteFound.text(
                      startingPosition.getLine(), startingPosition.getColumn());
              throw new ParsingException(startingPosition, msg);
            }
            endIndex = input.index() + 1; // beyond last character read
            tokens.addToken(startingPosition, startIndex, endIndex, SINGLE_QUOTED_STRING);
            break;
          case '/':
            startIndex = input.index();
            startingPosition = input.position(startIndex);
            if (input.isNext('/')) {
              // End-of-line comment ...
              boolean foundLineTerminator = false;
              while (input.hasNext()) {
                c = input.next();
                if (c == '\n' || c == '\r') {
                  foundLineTerminator = true;
                  break;
                }
              }
              endIndex = input.index(); // the token won't include the '\n' or '\r' character(s)
              if (!foundLineTerminator) ++endIndex; // must point beyond last char
              // treat "\r\n" as a single line terminator
              if (c == '\r' && input.isNext('\n')) input.next();
              if (useComments) {
                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
              }

            } else if (input.isNext('*')) {
              // Multi-line comment ...
              // advance until the next two characters are "*/"; an unterminated comment simply
              // runs to the end of the input (no exception is thrown in this branch)
              while (input.hasNext() && !input.isNext('*', '/')) {
                c = input.next();
              }
              if (input.hasNext()) input.next(); // consume the '*'
              if (input.hasNext()) input.next(); // consume the '/'

              endIndex = input.index() + 1; // the token will include the '/' and '*' characters
              if (useComments) {
                tokens.addToken(startingPosition, startIndex, endIndex, COMMENT);
              }

            } else {
              // just a regular slash ...
              tokens.addToken(startingPosition, startIndex, startIndex + 1, SYMBOL);
            }
            break;
          default:
            // Any other character starts a word
            startIndex = input.index();
            Position startPosition = input.position(startIndex);
            // Read until another whitespace/symbol/decimal/slash is found
            // (quote characters are not in the stop list, so they do not end a word)
            while (input.hasNext()
                && !(input.isNextWhitespace() || input.isNextAnyOf("/.-(){}*,;+%?[]!<>|=:"))) {
              c = input.next();
            }
            endIndex = input.index() + 1; // beyond last character that was included
            tokens.addToken(startPosition, startIndex, endIndex, WORD);
        }
      }
    }