/**
 * Static service to get a term from its string representation, providing a specific operator
 * manager.
 *
 * <p>The text must contain exactly one term: after the term has been parsed, the next token read
 * from the tokenizer has to be EOF.
 *
 * @param st the text to parse
 * @param op the operator manager handed to the parser
 * @return the parsed term, after {@code resolveTerm()} has been invoked on it
 * @throws InvalidTermException if the text starts with EOF, if the parser yields a null term, if
 *     tokens remain after the first term, or if an I/O error occurs while reading (in which case
 *     the underlying {@link IOException} is attached as the cause)
 */
public static Term parseSingleTerm(String st, OperatorManager op) throws InvalidTermException {
  try {
    Parser p = new Parser(op, st);

    // Peek at the first token only to reject empty input, then hand it back to the parser.
    Token t = p.tokenizer.readToken();
    if (t.isEOF()) {
      throw new InvalidTermException("Term starts with EOF");
    }
    p.tokenizer.unreadToken(t);

    Term term = p.expr(false);
    if (term == null) {
      throw new InvalidTermException("Term is null");
    }

    // Anything left over after the term means the input was not a single term.
    if (!p.tokenizer.readToken().isEOF()) {
      throw new InvalidTermException("The entire string could not be read as one term");
    }

    term.resolveTerm();
    return term;
  } catch (IOException ex) {
    // Keep the original I/O failure reachable through getCause() instead of discarding it.
    InvalidTermException failure = new InvalidTermException("An I/O error occured");
    failure.initCause(ex);
    throw failure;
  }
}
/** * Parses and returns a valid 'leftside' of an expression. If the left side starts with a prefix, * it consumes other expressions with a lower priority than itself. If the left side does not have * a prefix it must be an expr0. * * @param commaIsEndMarker used when the leftside is part of and argument list of expressions * @param maxPriority operators with a higher priority than this will effectivly end the * expression * @return a wrapper of: 1. term correctly structured and 2. the priority of its root operator * @throws InvalidTermException */ private IdentifiedTerm parseLeftSide(boolean commaIsEndMarker, int maxPriority) throws InvalidTermException, IOException { // 1. prefix expression Token f = tokenizer.readToken(); if (f.isOperator(commaIsEndMarker)) { int FX = opManager.opPrio(f.seq, "fx"); int FY = opManager.opPrio(f.seq, "fy"); if (f.seq.equals("-")) { Token t = tokenizer.readToken(); if (t.isNumber()) /*Michele Castagna 06/2011*/ // return new IdentifiedTerm(0, Parser.createNumber("-" + t.seq)); return identifyTerm(0, Parser.createNumber("-" + t.seq), tokenStart); /**/ else tokenizer.unreadToken(t); } // check that no operator has a priority higher than permitted if (FY > maxPriority) FY = -1; if (FX > maxPriority) FX = -1; // FX has priority over FY boolean haveAttemptedFX = false; if (FX >= FY && FX >= OperatorManager.OP_LOW) { IdentifiedTerm found = exprA(FX - 1, commaIsEndMarker); // op(fx, n) exprA(n - 1) if (found != null) /*Castagna 06/2011*/ // return new IdentifiedTerm(FX, new Struct(f.seq, found.result)); return identifyTerm(FX, new Struct(f.seq, found.result), tokenStart); /**/ else haveAttemptedFX = true; } // FY has priority over FX, or FX has failed if (FY >= OperatorManager.OP_LOW) { IdentifiedTerm found = exprA(FY, commaIsEndMarker); // op(fy,n) exprA(1200) or op(fy,n) exprA(n) if (found != null) /*Castagna 06/2011*/ // return new IdentifiedTerm(FY, new Struct(f.seq, found.result)); return identifyTerm(FY, new Struct(f.seq, 
found.result), tokenStart); /**/ } // FY has priority over FX, but FY failed if (!haveAttemptedFX && FX >= OperatorManager.OP_LOW) { IdentifiedTerm found = exprA(FX - 1, commaIsEndMarker); // op(fx, n) exprA(n - 1) if (found != null) /*Castagna 06/2011*/ // return new IdentifiedTerm(FX, new Struct(f.seq, found.result)); return identifyTerm(FX, new Struct(f.seq, found.result), tokenStart); /**/ } } tokenizer.unreadToken(f); // 2. expr0 return new IdentifiedTerm(0, expr0()); }
Token readNextToken() throws IOException, InvalidTermException { int typea; String svala; if (pushBack2 != null) { typea = pushBack2.typea; svala = pushBack2.svala; pushBack2 = null; } else { typea = super.nextToken(); svala = sval; } // skips whitespace // could be simplified if lookahead for blank space in functors wasn't necessary // and if '.' in numbers could be written with blank space while (Tokenizer.isWhite(typea)) { typea = super.nextToken(); svala = sval; } // skips single line comments // could be simplified if % was not a legal character in quotes if (typea == '%') { do { typea = super.nextToken(); } while (typea != '\r' && typea != '\n' && typea != TT_EOF); pushBack(); // pushes back \r or \n. These are whitespace, so when readNextToken() finds them, // they are marked as whitespace return readNextToken(); } // skips /* comments */ if (typea == '/') { int typeb = super.nextToken(); if (typeb == '*') { do { typea = typeb; typeb = super.nextToken(); } while (typea != '*' || typeb != '/'); return readNextToken(); } else { pushBack(); } } // syntactic charachters if (typea == TT_EOF) return new Token("", Tokenizer.EOF); if (typea == '(') return new Token("(", Tokenizer.LPAR); if (typea == ')') return new Token(")", Tokenizer.RPAR); if (typea == '{') return new Token("{", Tokenizer.LBRA2); if (typea == '}') return new Token("}", Tokenizer.RBRA2); if (typea == '[') return new Token("[", Tokenizer.LBRA); if (typea == ']') return new Token("]", Tokenizer.RBRA); if (typea == '|') return new Token("|", Tokenizer.BAR); if (typea == '!') return new Token("!", Tokenizer.ATOM); if (typea == ',') return new Token(",", Tokenizer.OPERATOR); if (typea == '.') { // check that '.' 
as end token is followed by a layout character, see ISO Standard // 6.4.8 endnote int typeb = super.nextToken(); if (Tokenizer.isWhite(typeb) || typeb == '%' || typeb == StreamTokenizer.TT_EOF) return new Token(".", Tokenizer.END); else pushBack(); } boolean isNumber = false; // variable, atom or number if (typea == TT_WORD) { char firstChar = svala.charAt(0); // variable if (Character.isUpperCase(firstChar) || '_' == firstChar) return new Token(svala, Tokenizer.VARIABLE); else if (firstChar >= '0' && firstChar <= '9') // all words starting with 0 or 9 must be a number isNumber = true; // set type to number and handle later else { // otherwise, it must be an atom (or wrong) int typeb = super.nextToken(); // lookahead 1 to identify what type of atom pushBack(); // this does not skip whitespaces, only readNext does so. if (typeb == '(') return new Token(svala, Tokenizer.ATOM | Tokenizer.FUNCTOR); if (Tokenizer.isWhite(typeb)) return new Token(svala, Tokenizer.ATOM | Tokenizer.OPERATOR); return new Token(svala, Tokenizer.ATOM); } } // quotes if (typea == '\'' || typea == '\"' || typea == '`') { int qType = typea; StringBuffer quote = new StringBuffer(); while (true) { // run through entire quote and added body to quote buffer typea = super.nextToken(); svala = sval; // continuation escape sequence if (typea == '\\') { int typeb = super.nextToken(); if (typeb == '\n') // continuation escape sequence marker \\n continue; if (typeb == '\r') { int typec = super.nextToken(); if (typec == '\n') continue; // continuation escape sequence marker \\r\n pushBack(); continue; // continuation escape sequence marker \\r } pushBack(); // pushback typeb } // double '' or "" or `` if (typea == qType) { int typeb = super.nextToken(); if (typeb == qType) { // escaped '' or "" or `` quote.append((char) qType); continue; } else { pushBack(); break; // otherwise, break on single quote } } if (typea == '\n' || typea == '\r') throw new InvalidTermException( "line break in quote not allowed 
(unless they are escaped \\ first)"); if (svala != null) quote.append(svala); else quote.append((char) typea); } String quoteBody = quote.toString(); qType = qType == '\'' ? SQ_SEQUENCE : qType == '\"' ? DQ_SEQUENCE : SQ_SEQUENCE; if (qType == SQ_SEQUENCE) { if (Parser.isAtom(quoteBody)) qType = ATOM; int typeb = super.nextToken(); // lookahead 1 to identify what type of quote pushBack(); // nextToken() does not skip whitespaces, only readNext does so. if (typeb == '(') return new Token(quoteBody, qType | FUNCTOR); } return new Token(quoteBody, qType); } // symbols if (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typea) >= 0) { // the symbols are parsed individually by the super.nextToken(), so accumulate symbollist StringBuffer symbols = new StringBuffer(); int typeb = typea; // String svalb = null; while (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typeb) >= 0) { symbols.append((char) typeb); typeb = super.nextToken(); // svalb = sval; } pushBack(); // special symbols: unary + and unary - // try { // if (symbols.length() == 1 && typeb == TT_WORD && // java.lang.Long.parseLong(svalb) > 0) { // if (typea == '+') //todo, issue of handling + // and -. I don't think this is ISO.. // return readNextToken(); //skips + and returns the next // number // if (typea == '-') { // Token t = readNextToken(); //read the next number // t.seq = "-" + t.seq; //add minus to value // return t; //return token // } // } //ps. the reason why the // number isn't returned right away, but through nextToken(), is because the number might be // for instance a float // } catch (NumberFormatException e) { // } return new Token(symbols.toString(), Tokenizer.OPERATOR); } // numbers: 1. integer, 2. float if (isNumber) { try { // the various parseInt checks will throw exceptions when parts of numbers are written // illegally // 1.a. 
complex integers if (svala.startsWith("0")) { if (svala.indexOf('b') == 1) return new Token( "" + java.lang.Long.parseLong(svala.substring(2), 2), Tokenizer.INTEGER); // try binary if (svala.indexOf('o') == 1) return new Token( "" + java.lang.Long.parseLong(svala.substring(2), 8), Tokenizer.INTEGER); // try octal if (svala.indexOf('x') == 1) return new Token( "" + java.lang.Long.parseLong(svala.substring(2), 16), Tokenizer.INTEGER); // try hex } // lookahead 1 int typeb = super.nextToken(); String svalb = sval; // 1.b ordinary integers if (typeb != '.' && typeb != '\'') { // i.e. not float or character constant pushBack(); // lookahead 0 return new Token("" + java.lang.Long.parseLong(svala), Tokenizer.INTEGER); } // 1.c character code constant if (typeb == '\'' && "0".equals(svala)) { int typec = super.nextToken(); // lookahead 2 String svalc = sval; int intVal; if ((intVal = isCharacterCodeConstantToken(typec, svalc)) != -1) return new Token("" + intVal, Tokenizer.INTEGER); // this is an invalid character code constant int throw new InvalidTermException( "Character code constant starting with 0'<X> at line: " + super.lineno() + " cannot be recognized."); } // 2.a check that the value of the word prior to period is a valid long java.lang.Long.parseLong(svala); // throws an exception if not // 2.b first int is followed by a period if (typeb != '.') throw new InvalidTermException( "A number starting with 0-9 cannot be rcognized as an int and does not have a fraction '.' at line: " + super.lineno()); // lookahead 2 int typec = super.nextToken(); String svalc = sval; // 2.c check that the next token after '.' 
is a possible fraction if (typec != TT_WORD) { // if its not, the period is an End period pushBack(); // pushback 1 the token after period pushBack2 = new PushBack(typeb, svalb); // pushback 2 the period token return new Token(svala, INTEGER); // return what must be an int } // 2.d checking for exponent int exponent = svalc.indexOf("E"); if (exponent == -1) exponent = svalc.indexOf("e"); if (exponent >= 1) { // the float must have a valid exponent if (exponent == svalc.length() - 1) { // the exponent must be signed exponent int typeb2 = super.nextToken(); if (typeb2 == '+' || typeb2 == '-') { int typec2 = super.nextToken(); String svalc2 = sval; if (typec2 == TT_WORD) { // verify the remaining parts of the float and return java.lang.Long.parseLong(svalc.substring(0, exponent)); java.lang.Integer.parseInt(svalc2); return new Token(svala + "." + svalc + (char) typeb2 + svalc2, Tokenizer.FLOAT); } } } } // 2.e verify lastly that ordinary floats and unsigned exponent floats are Java legal and // return them java.lang.Double.parseDouble(svala + "." + svalc); return new Token(svala + "." + svalc, Tokenizer.FLOAT); } catch (NumberFormatException e) { // TODO return more info on what was wrong with the number given throw new InvalidTermException( "A term starting with 0-9 cannot be parsed as a number at line: " + lineno()); } } throw new InvalidTermException("Unknown Unicode character: " + typea + " (" + svala + ")"); }
/** * exprA(0) ::= integer | float | variable | atom | atom( exprA(1200) { , exprA(1200) }* ) | '[' * exprA(1200) { , exprA(1200) }* [ | exprA(1200) ] ']' | '{' [ exprA(1200) ] '}' | '(' * exprA(1200) ')' */ private Term expr0() throws InvalidTermException, IOException { Token t1 = tokenizer.readToken(); /*Castagna 06/2011*/ /* if (t1.isType(Tokenizer.INTEGER)) return Parser.parseInteger(t1.seq); //todo moved method to Number if (t1.isType(Tokenizer.FLOAT)) return Parser.parseFloat(t1.seq); //todo moved method to Number if (t1.isType(Tokenizer.VARIABLE)) return new Var(t1.seq); //todo switched to use the internal check for "_" in Var(String) */ int tempStart = tokenizer.tokenStart(); if (t1.isType(Tokenizer.INTEGER)) { Term i = Parser.parseInteger(t1.seq); map(i, tokenizer.tokenStart()); return i; // todo moved method to Number } if (t1.isType(Tokenizer.FLOAT)) { Term f = Parser.parseFloat(t1.seq); map(f, tokenizer.tokenStart()); return f; // todo moved method to Number } if (t1.isType(Tokenizer.VARIABLE)) { Term v = new Var(t1.seq); map(v, tokenizer.tokenStart()); return v; // todo switched to use the internal check for "_" in Var(String) } /**/ if (t1.isType(Tokenizer.ATOM) || t1.isType(Tokenizer.SQ_SEQUENCE) || t1.isType(Tokenizer.DQ_SEQUENCE)) { if (!t1.isFunctor()) /*Castagna 06/2011*/ { // return new Struct(t1.seq); Term f = new Struct(t1.seq); map(f, tokenizer.tokenStart()); return f; } /**/ String functor = t1.seq; Token t2 = tokenizer.readToken(); // reading left par if (!t2.isType(Tokenizer.LPAR)) throw new InvalidTermException( "Something identified as functor misses its first left parenthesis"); // todo check can // be skipped LinkedList<Term> a = expr0_arglist(); // reading arguments Token t3 = tokenizer.readToken(); if (t3.isType(Tokenizer.RPAR)) // reading right par /*Castagna 06/2011*/ { // return new Struct(functor, a); Term c = new Struct(functor, a); map(c, tempStart); return c; } /**/ /*Castagna 06/2011*/ // throw new 
InvalidTermException("Missing right parenthesis: ("+a + " -> here <-"); throw new InvalidTermException( "Missing right parenthesis '(" + a + "' -> here <-", tokenizer.offsetToRowColumn(getCurrentOffset())[0], tokenizer.offsetToRowColumn(getCurrentOffset())[1] - 1); /**/ } if (t1.isType(Tokenizer.LPAR)) { Term term = expr(false); if (tokenizer.readToken().isType(Tokenizer.RPAR)) return term; /*Castagna 06/2011*/ // throw new InvalidTermException("Missing right parenthesis: ("+term + " -> here <-"); throw new InvalidTermException( "Missing right parenthesis '(" + term + "' -> here <-", tokenizer.offsetToRowColumn(getCurrentOffset())[0], tokenizer.offsetToRowColumn(getCurrentOffset())[1] - 1); /**/ } if (t1.isType(Tokenizer.LBRA)) { Token t2 = tokenizer.readToken(); if (t2.isType(Tokenizer.RBRA)) return new Struct(); tokenizer.unreadToken(t2); Term term = expr0_list(); if (tokenizer.readToken().isType(Tokenizer.RBRA)) return term; /*Castagna 06/2011*/ // throw new InvalidTermException("Missing right bracket: ["+term + " -> here <-"); throw new InvalidTermException( "Missing right bracket '[" + term + " ->' here <-", tokenizer.offsetToRowColumn(getCurrentOffset())[0], tokenizer.offsetToRowColumn(getCurrentOffset())[1] - 1); /**/ } if (t1.isType(Tokenizer.LBRA2)) { Token t2 = tokenizer.readToken(); if (t2.isType(Tokenizer.RBRA2)) /*Castagna 06/2011*/ { // return new Struct("{}"); Term b = new Struct("{}"); map(b, tempStart); return b; } /**/ tokenizer.unreadToken(t2); Term arg = expr(false); t2 = tokenizer.readToken(); if (t2.isType(Tokenizer.RBRA2)) /*Castagna 06/2011*/ { // return new Struct("{}", arg); Term b = new Struct("{}", arg); map(b, tempStart); return b; } /*Castagna 06/2011*/ // throw new InvalidTermException("Missing right braces: {"+arg + " -> here <-"); throw new InvalidTermException( "Missing right braces '{" + arg + "' -> here <-", tokenizer.offsetToRowColumn(getCurrentOffset())[0], tokenizer.offsetToRowColumn(getCurrentOffset())[1] - 1); /**/ } 
/*Castagna 06/2011*/ // throw new InvalidTermException("The following token could not be identified: "+t1.seq); throw new InvalidTermException( "Unexpected token '" + t1.seq + "'", tokenizer.offsetToRowColumn(getCurrentOffset())[0], tokenizer.offsetToRowColumn(getCurrentOffset())[1] - 1); /**/ }