public static double[][] extractBigrams(
    File file, MapSymbolTable symbolTable, TokenizerFactory tokenizerFactory, String charset)
    throws Exception {

  char[] cs = Files.readCharsFromFile(file, charset);
  String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
  System.out.println(" Number of tokens=" + tokens.length);

  // Map each token to a symbol id; non-alphabetic tokens get -1 and are skipped below.
  int[] symbols = new int[tokens.length];
  for (int i = 0; i < tokens.length; ++i) {
    symbols[i] =
        Strings.allLetters(tokens[i].toCharArray())
            ? symbolTable.getOrAddSymbol(tokens[i])
            : -1;
  }

  int numSymbols = symbolTable.numSymbols();
  System.out.println(" Number of distinct tokens=" + numSymbols);
  System.out.println(" #Matrix entries=" + (long) numSymbols * numSymbols);

  // Java zero-initializes the matrix, so no explicit fill is needed.
  double[][] values = new double[numSymbols][numSymbols];
  for (int i = 1; i < symbols.length; ++i) {
    int left = symbols[i - 1];
    int right = symbols[i];
    if (left >= 0 && right >= 0) values[left][right] += 1.0;
  }
  return values;
}
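// Usage sketch (illustrative, not from the original source): build a bigram count matrix
// from a plain-text file. The file name, charset, and the choice of LingPipe's
// IndoEuropeanTokenizerFactory are assumptions made for this example.
public static void exampleExtractBigrams() throws Exception {
  MapSymbolTable symbolTable = new MapSymbolTable();
  TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
  double[][] bigramCounts =
      extractBigrams(new File("corpus.txt"), symbolTable, tokenizerFactory, "UTF-8");
  System.out.println("Matrix dimension=" + bigramCounts.length);
}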
public double log2Estimate(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  double logEstimate = 0.0;

  Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start);
  List<String> tokenList = new ArrayList<String>();
  while (true) {
    String whitespace = tokenizer.nextWhitespace();
    logEstimate += mWhitespaceModel.log2Estimate(whitespace);
    String token = tokenizer.nextToken();
    if (token == null) break;
    tokenList.add(token);
  }

  // collect token ids, estimate unknown tokens
  int[] tokIds = new int[tokenList.size() + 2];
  tokIds[0] = TokenizedLM.BOUNDARY_TOKEN;
  tokIds[tokIds.length - 1] = TokenizedLM.BOUNDARY_TOKEN;
  Iterator<String> it = tokenList.iterator();
  for (int i = 1; it.hasNext(); ++i) {
    String token = it.next();
    tokIds[i] = mSymbolTable.symbolToID(token);
    if (tokIds[i] < 0) {
      logEstimate += mUnknownTokenModel.log2Estimate(token);
    }
  }

  // estimate token ids
  for (int i = 2; i <= tokIds.length; ++i) {
    logEstimate += conditionalTokenEstimate(tokIds, 0, i);
  }
  return logEstimate;
}
/**
 * Increments by the specified count all substrings of the specified character array slice up to
 * the maximum length specified in the constructor.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @param count Amount to increment.
 * @throws IndexOutOfBoundsException If the specified start and end points are not within the
 *     bounds of the character array.
 */
public void incrementSubstrings(char[] cs, int start, int end, int count) {
  Strings.checkArgsStartEnd(cs, start, end);
  // increment maximal strings and prefixes
  for (int i = start; i + mMaxLength <= end; ++i)
    incrementPrefixes(cs, i, i + mMaxLength, count);
  // increment short final strings and prefixes
  for (int i = Math.max(start, end - mMaxLength + 1); i < end; ++i)
    incrementPrefixes(cs, i, end, count);
}
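// Usage sketch (illustrative only): counting substrings with the method above, assuming the
// enclosing class is LingPipe's TrieCharSeqCounter with its TrieCharSeqCounter(int maxLength)
// constructor and the CharSeqCounter count(char[],int,int) accessor.
public static void exampleIncrementSubstrings() {
  TrieCharSeqCounter counter = new TrieCharSeqCounter(3); // count substrings up to length 3
  char[] cs = "abab".toCharArray();
  counter.incrementSubstrings(cs, 0, cs.length, 1);
  long countAB = counter.count(cs, 0, 2); // "ab" occurs twice in "abab", so expect 2
  System.out.println("count(\"ab\")=" + countAB);
}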
StringBuilder normalizeQuery(CharSequence cSeq) {
  StringBuilder sb = new StringBuilder();
  sb.append(' ');
  if (mTokenizerFactory == null) {
    Strings.normalizeWhitespace(cSeq, sb);
    sb.append(' ');
  } else {
    char[] cs = Strings.toCharArray(cSeq);
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, 0, cs.length);
    String nextToken;
    while ((nextToken = tokenizer.nextToken()) != null) {
      mTokenCounter.increment(nextToken);
      sb.append(nextToken);
      sb.append(' ');
    }
  }
  return sb;
}
// Reads one trie node (its count plus its daughters) from the reader, recursing on each
// daughter; nodes deeper than maxDepth are consumed via skipNode and pruned to null.
private static Node readNode(TrieReader reader, int depth, int maxDepth) throws IOException {
  if (depth > maxDepth) {
    skipNode(reader);
    return null;
  }
  long count = reader.readCount();
  int depthPlus1 = depth + 1;

  long sym1 = reader.readSymbol();
  // 0+ daughters
  if (sym1 == -1L) return NodeFactory.createNode(count);

  // 1+ daughters
  Node node1 = readNode(reader, depthPlus1, maxDepth);
  long sym2 = reader.readSymbol();
  if (sym2 == -1L) return NodeFactory.createNodeFold((char) sym1, node1, count);

  Node node2 = readNode(reader, depthPlus1, maxDepth);
  long sym3 = reader.readSymbol();
  if (sym3 == -1L)
    return NodeFactory.createNode((char) sym1, node1, (char) sym2, node2, count);

  Node node3 = readNode(reader, depthPlus1, maxDepth);
  long sym4 = reader.readSymbol();
  if (sym4 == -1L)
    return NodeFactory.createNode(
        (char) sym1, node1, (char) sym2, node2, (char) sym3, node3, count);

  Node node4 = readNode(reader, depthPlus1, maxDepth);

  // 4+ daughters
  StringBuilder cBuf = new StringBuilder();
  cBuf.append((char) sym1);
  cBuf.append((char) sym2);
  cBuf.append((char) sym3);
  cBuf.append((char) sym4);
  List<Node> nodeList = new ArrayList<Node>();
  nodeList.add(node1);
  nodeList.add(node2);
  nodeList.add(node3);
  nodeList.add(node4);
  long sym;
  while ((sym = reader.readSymbol()) != -1L) {
    cBuf.append((char) sym);
    nodeList.add(readNode(reader, depthPlus1, maxDepth));
  }
  Node[] nodes = nodeList.toArray(EMPTY_NODE_ARRAY);
  char[] cs = Strings.toCharArray(cBuf);
  return NodeFactory.createNode(cs, nodes, count); // > 3 daughters
}
public int numCharactersFollowing(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  return mRootNode.numOutcomes(cs, start, end);
}

public char[] charactersFollowing(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  return com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs, start, end));
}

public long extensionCount(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  return mRootNode.contextCount(cs, start, end);
}
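// Usage sketch (illustrative only): querying the three accessors above for the context "th"
// on a counter populated elsewhere; the TrieCharSeqCounter parameter type is an assumption
// about the enclosing class.
public static void exampleContextQueries(TrieCharSeqCounter counter) {
  char[] context = "th".toCharArray();
  int numFollowing = counter.numCharactersFollowing(context, 0, context.length);
  char[] following = counter.charactersFollowing(context, 0, context.length);
  long contextCount = counter.extensionCount(context, 0, context.length);
  System.out.println(numFollowing + " distinct characters follow \"th\"; "
      + "extension count=" + contextCount
      + "; followers=" + new String(following));
}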
/**
 * Decrements all of the substrings of the specified character slice by one. This method may be
 * used in conjunction with {@link #incrementSubstrings(char[],int,int)} to implement counts for
 * conditional probability estimates without affecting underlying estimates. For example, the
 * following code:
 *
 * <blockquote>
 *
 * <pre>
 * char[] cs = "abcdefghi".toCharArray();
 * counter.incrementSubstrings(cs,3,7);
 * counter.decrementSubstrings(cs,3,5);
 * </pre>
 *
 * </blockquote>
 *
 * will increment the substrings of <code>"defg"</code> and then decrement the substrings of
 * <code>"de"</code>, causing the net effect of incrementing the counts of substrings
 * <code>"defg"</code>, <code>"efg"</code>, <code>"fg"</code>, <code>"g"</code>,
 * <code>"def"</code>, <code>"ef"</code>, and <code>"f"</code>. This has the effect of increasing
 * the estimate of <code>g</code> given <code>def</code>, without increasing the estimate of
 * <code>d</code> in an empty context.
 *
 * @param cs Underlying array of characters in slice.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @throws IndexOutOfBoundsException If the array slice is not valid.
 */
public void decrementSubstrings(char[] cs, int start, int end) {
  Strings.checkArgsStartEnd(cs, start, end);
  for (int i = start; i < end; ++i)
    for (int j = i; j <= end; ++j)
      mRootNode = mRootNode.decrement(cs, i, j);
}
/**
 * Increments by the specified count all prefixes of the specified character array slice up to
 * the maximum length specified in the constructor.
 *
 * @param cs Underlying character array.
 * @param start Index of first character in slice.
 * @param end Index of one past last character in slice.
 * @param count Amount to increment.
 * @throws IndexOutOfBoundsException If the specified start and end points are not within the
 *     bounds of the character array.
 */
public void incrementPrefixes(char[] cs, int start, int end, int count) {
  Strings.checkArgsStartEnd(cs, start, end);
  mRootNode = mRootNode.increment(cs, start, end, count);
}
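// Illustrative contrast (the TrieCharSeqCounter parameter type is an assumption about the
// enclosing class): incrementPrefixes bumps only the prefixes of the slice, whereas
// incrementSubstrings bumps every substring of the slice.
public static void examplePrefixesVsSubstrings(TrieCharSeqCounter counter) {
  char[] cs = "abc".toCharArray();
  counter.incrementPrefixes(cs, 0, cs.length, 1);   // bumps "a", "ab", "abc"
  counter.incrementSubstrings(cs, 0, cs.length, 1); // bumps "a", "ab", "abc", "b", "bc", "c"
}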
public ParseResult parseSentence(String sentence) {
  String result = "";

  // see if a parser socket server is available
  int port = Integer.parseInt(ARKref.getProperties().getProperty("parserServerPort", "5556"));
  String host = "127.0.0.1";
  Socket client;
  PrintWriter pw;
  BufferedReader br;
  String line;
  try {
    client = new Socket(host, port);
    pw = new PrintWriter(client.getOutputStream());
    br = new BufferedReader(new InputStreamReader(client.getInputStream()));
    pw.println(sentence);
    pw.flush(); // flush to complete the transmission

    // the server sends the parse-tree line(s) first; the final line is the parse score
    while ((line = br.readLine()) != null) {
      if (br.ready()) {
        line = line.replaceAll("\n", "");
        line = line.replaceAll("\\s+", " ");
        result += line + " ";
      } else {
        lastParseScore = Double.parseDouble(line);
      }
    }
    br.close();
    pw.close();
    client.close();
    System.err.println("parser output:" + result);
    lastParse = readTreeFromString(result);
    boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))");
    return new ParseResult(success, lastParse, lastParseScore);
  } catch (Exception ex) {
    // no parser server available; fall through to the local parser below
  }

  // if socket server not available, then use a local parser object
  if (parser == null) {
    if (DEBUG) System.err.println("Could not connect to parser server. Loading parser...");
    try {
      Options op = new Options();
      String serializedInputFileOrUrl =
          ClassLoader.getSystemResource(
                  ARKref.getProperties()
                      .getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz"))
              .toExternalForm();
      parser = LexicalizedParser.loadModel(serializedInputFileOrUrl, op);
      // int maxLength = Integer.parseInt(ARKref.getProperties().getProperty("parserMaxLength", "40"));
      // parser.setMaxLength(maxLength);
      parser.setOptionFlags("-outputFormat", "oneline");
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  try {
    DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sentence));
    LexicalizedParserQuery query = parser.parserQuery();
    if (query.parse(dp.iterator().next())) {
      lastParse = query.getBestParse();
      lastParseScore = query.getPCFGScore();
      TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack());
      StringWriter sb = new StringWriter();
      pw = new PrintWriter(sb);
      tp.printTree(lastParse, pw);
      pw.flush();
      lastParse = readTreeFromString(sb.getBuffer().toString());
      return new ParseResult(true, lastParse, lastParseScore);
    }
  } catch (Exception e) {
    // local parse failed; fall through to the failure result below
  }

  // neither the server nor the local parser produced a parse
  lastParse = readTreeFromString("(ROOT (. .))");
  lastParseScore = -99999.0;
  return new ParseResult(false, lastParse, lastParseScore);
}
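// Minimal sketch (an assumption, not ARKref's actual server implementation) of the wire
// protocol the socket branch above expects: the client sends one sentence per line, and the
// server replies with the parse-tree line(s) followed by the parse score as the final line
// before closing the connection. Port 5556 matches the default property value above.
public static void exampleParserServer() throws IOException {
  try (ServerSocket server = new ServerSocket(5556);
       Socket sock = server.accept();
       BufferedReader in = new BufferedReader(new InputStreamReader(sock.getInputStream()));
       PrintWriter out = new PrintWriter(sock.getOutputStream(), true)) {
    String sentence = in.readLine();                  // sentence sent by parseSentence
    System.err.println("received: " + sentence);
    out.println("(ROOT (NP (DT The) (NN example)))"); // placeholder parse tree
    out.println("-42.0");                             // parse score as the last line
  }
}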
// next two methods cut-and-pasted from TokenizedLM
public double log2Estimate(CharSequence cSeq) {
  char[] cs = Strings.toCharArray(cSeq);
  return log2Estimate(cs, 0, cs.length);
}
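// Usage sketch (illustrative only): per-character cross-entropy of a string via the
// CharSequence convenience method above, shown here against LingPipe's TokenizedLM, the
// class the method was copied from; the sample text is arbitrary.
public static void exampleCrossEntropy(TokenizedLM lm) {
  String text = "the quick brown fox";
  double log2Prob = lm.log2Estimate(text);              // total log (base 2) probability
  double crossEntropyRate = -log2Prob / text.length();  // bits per character
  System.out.println("cross-entropy rate=" + crossEntropyRate + " bits/char");
}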