/**
 * Reads the specified file, tokenizes it, and returns a matrix of bigram
 * counts over the all-letter tokens, indexed by the ids assigned by the
 * specified symbol table (which is mutated via {@code getOrAddSymbol}).
 *
 * @param file File to read.
 * @param symbolTable Symbol table mapping tokens to matrix indices.
 * @param tokenizerFactory Factory used to tokenize the file contents.
 * @param charset Character encoding of the file.
 * @return Square matrix of bigram counts, dimension = number of symbols.
 * @throws Exception If there is an underlying I/O or tokenization error.
 */
public static double[][] extractBigrams(
      File file, MapSymbolTable symbolTable, TokenizerFactory tokenizerFactory, String charset)
      throws Exception {

    char[] cs = Files.readCharsFromFile(file, charset);
    String[] tokens = tokenizerFactory.tokenizer(cs, 0, cs.length).tokenize();
    System.out.println("    Number of tokens=" + tokens.length);

    // Map each token to a symbol id; tokens containing any non-letter are
    // marked -1 and excluded from the bigram counts below.
    int[] symbols = new int[tokens.length];
    for (int i = 0; i < tokens.length; ++i) {
      symbols[i] =
          Strings.allLetters(tokens[i].toCharArray()) ? symbolTable.getOrAddSymbol(tokens[i]) : -1;
    }

    int numSymbols = symbolTable.numSymbols();
    System.out.println("    Number of distinct tokens=" + numSymbols);
    // Long arithmetic: numSymbols * numSymbols overflows int above 46340.
    System.out.println("    #Matrix entries=" + ((long) numSymbols * numSymbols));

    // New Java arrays are zero-initialized; no explicit fill is needed.
    double[][] values = new double[numSymbols][numSymbols];

    // Count adjacent (left, right) symbol pairs, skipping pairs in which
    // either member was filtered out (-1).
    for (int i = 1; i < symbols.length; ++i) {
      int left = symbols[i - 1];
      int right = symbols[i];
      if (left >= 0 && right >= 0) values[left][right] += 1.0;
    }

    return values;
  }
// 예제 #2 (Example #2)
  /**
   * Returns the log (base 2) estimate of the specified character slice
   * under this tokenized language model.  The estimate sums contributions
   * from the whitespace model (for inter-token whitespace), the
   * unknown-token model (for out-of-vocabulary tokens), and the token
   * n-gram model (for the bounded token-id sequence).
   *
   * @param cs Underlying character array.
   * @param start Index of first character in slice.
   * @param end Index of one past last character in slice.
   * @return Log (base 2) estimate of the slice.
   */
  public double log2Estimate(char[] cs, int start, int end) {
    Strings.checkArgsStartEnd(cs, start, end);
    double logEstimate = 0.0;

    // Tokenize the slice, scoring each run of whitespace (including the
    // final trailing run) as it is consumed.
    // NOTE(review): the factory is called with (start, end - start), i.e.
    // an (offset, length) pair — assumed to be the factory's contract.
    Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs, start, end - start);
    List<String> tokenList = new ArrayList<String>();
    while (true) {
      String whitespace = tokenizer.nextWhitespace();
      logEstimate += mWhitespaceModel.log2Estimate(whitespace);
      String token = tokenizer.nextToken();
      if (token == null) break;
      tokenList.add(token);
    }

    // collect token ids, estimate unknown tokens
    // The id sequence is bracketed by boundary tokens at both ends.
    int[] tokIds = new int[tokenList.size() + 2];
    tokIds[0] = TokenizedLM.BOUNDARY_TOKEN;
    tokIds[tokIds.length - 1] = TokenizedLM.BOUNDARY_TOKEN;
    Iterator<String> it = tokenList.iterator();
    for (int i = 1; it.hasNext(); ++i) {
      String token = it.next();
      tokIds[i] = mSymbolTable.symbolToID(token);
      // Out-of-vocabulary token: charge its spelling to the unknown-token
      // model; its negative id stays in tokIds for the n-gram pass below.
      if (tokIds[i] < 0) {
        logEstimate += mUnknownTokenModel.log2Estimate(token);
      }
    }

    // estimate token ids
    // Score each position from the second id onward, conditioned on the
    // prefix of ids that precedes it.
    for (int i = 2; i <= tokIds.length; ++i) {
      logEstimate += conditionalTokenEstimate(tokIds, 0, i);
    }
    return logEstimate;
  }
 /**
  * Increments by the specified count all substrings of the specified character array slice up to
  * the maximum length specified in the constructor.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @param count Amount to increment.
  * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the
  *     bounds of character sequence.
  */
 public void incrementSubstrings(char[] cs, int start, int end, int count) {
   Strings.checkArgsStartEnd(cs, start, end);
   // Positions with room for a full-length window: increment the window
   // and all of its prefixes.
   int lastFullStart = end - mMaxLength;
   for (int pos = start; pos <= lastFullStart; ++pos) {
     incrementPrefixes(cs, pos, pos + mMaxLength, count);
   }
   // Tail positions whose window is truncated by the end of the slice.
   int tailStart = Math.max(start, end - mMaxLength + 1);
   for (int pos = tailStart; pos < end; ++pos) {
     incrementPrefixes(cs, pos, end, count);
   }
 }
 /**
  * Normalizes the specified query text: the result begins with a single
  * space and, when a tokenizer factory is configured, consists of the
  * query's tokens each followed by a single space (incrementing the
  * token counter for each token as a side effect); otherwise the query's
  * whitespace is normalized and a trailing space appended.
  *
  * @param cSeq Query character sequence to normalize.
  * @return Normalized query text.
  */
 StringBuilder normalizeQuery(CharSequence cSeq) {
   StringBuilder normalized = new StringBuilder();
   normalized.append(' ');
   if (this.mTokenizerFactory == null) {
     // No tokenizer configured: just collapse whitespace in place.
     Strings.normalizeWhitespace(cSeq, normalized);
     normalized.append(' ');
   } else {
     char[] queryChars = Strings.toCharArray(cSeq);
     Tokenizer tokenizer = this.mTokenizerFactory.tokenizer(queryChars, 0, queryChars.length);
     for (String token = tokenizer.nextToken();
         token != null;
         token = tokenizer.nextToken()) {
       this.mTokenCounter.increment(token); // side effect: token frequency
       normalized.append(token);
       normalized.append(' ');
     }
   }
   return normalized;
 }
  /**
   * Reads one trie node, and recursively its daughter subtrees, from the
   * specified reader, pruning at the specified maximum depth.  Nodes
   * deeper than the maximum are consumed from the reader but returned as
   * {@code null}.
   *
   * <p>Serialized form read here: a count, then zero or more
   * (symbol, subtree) pairs, terminated by a -1 symbol.  Dedicated
   * factory methods are used for up to three daughters; four or more
   * fall through to the array-based factory method.
   *
   * @param reader Trie reader to consume.
   * @param depth Depth of the node being read.
   * @param maxDepth Maximum depth of node to materialize.
   * @return The node read, or {@code null} if deeper than the maximum.
   * @throws IOException If there is an underlying I/O error.
   */
  private static Node readNode(TrieReader reader, int depth, int maxDepth) throws IOException {

    // Past the depth bound: consume this node's bytes but build nothing.
    if (depth > maxDepth) {
      skipNode(reader);
      return null;
    }

    long count = reader.readCount();

    int depthPlus1 = depth + 1;

    long sym1 = reader.readSymbol();

    // 0+ daughters
    if (sym1 == -1L) return NodeFactory.createNode(count);

    // 1+ daughters
    Node node1 = readNode(reader, depthPlus1, maxDepth);
    long sym2 = reader.readSymbol();
    if (sym2 == -1L) return NodeFactory.createNodeFold((char) sym1, node1, count);

    // 2+ daughters
    Node node2 = readNode(reader, depthPlus1, maxDepth);
    long sym3 = reader.readSymbol();
    if (sym3 == -1L) return NodeFactory.createNode((char) sym1, node1, (char) sym2, node2, count);

    // 3+ daughters
    Node node3 = readNode(reader, depthPlus1, maxDepth);
    long sym4 = reader.readSymbol();
    if (sym4 == -1L)
      return NodeFactory.createNode(
          (char) sym1, node1, (char) sym2, node2, (char) sym3, node3, count);
    Node node4 = readNode(reader, depthPlus1, maxDepth);

    // 4+ daughters: collect symbols and subtrees in growable buffers
    // until the -1 terminator, then hand off to the array factory.
    StringBuilder cBuf = new StringBuilder();
    cBuf.append((char) sym1);
    cBuf.append((char) sym2);
    cBuf.append((char) sym3);
    cBuf.append((char) sym4);

    List<Node> nodeList = new ArrayList<Node>();
    nodeList.add(node1);
    nodeList.add(node2);
    nodeList.add(node3);
    nodeList.add(node4);

    long sym;

    while ((sym = reader.readSymbol()) != -1L) {
      cBuf.append((char) sym);
      nodeList.add(readNode(reader, depthPlus1, maxDepth));
    }
    Node[] nodes = nodeList.toArray(EMPTY_NODE_ARRAY);
    char[] cs = Strings.toCharArray(cBuf);
    return NodeFactory.createNode(cs, nodes, count); // > 3 daughters
  }
 /**
  * Returns the number of distinct characters that have followed the
  * specified character slice in the trained counts.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Number of distinct following characters.
  */
 public int numCharactersFollowing(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   return mRootNode.numOutcomes(cs, start, end);
 }
 /**
  * Returns a copy of the characters that have followed the specified
  * character slice in the trained counts.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Copy of the following characters.
  */
 public char[] charactersFollowing(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   return com.aliasi.util.Arrays.copy(mRootNode.outcomes(cs, start, end));
 }
 /**
  * Returns the total count of extensions of the specified character
  * slice as a context in the trained counts.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @return Context count for the slice.
  */
 public long extensionCount(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   return mRootNode.contextCount(cs, start, end);
 }
 /**
  * Decrements all of the substrings of the specified character slice by one. This method may be
  * used in conjunction with {@link #incrementSubstrings(char[],int,int)} to implement counts for
  * conditional probability estimates without affecting underlying estimates. For example, the
  * following code:
  *
  * <blockquote>
  *
  * <pre>
  * char[] cs = &quot;abcdefghi&quot;.toCharArray();
  * counter.incrementSubstrings(cs,3,7);
  * counter.decrementSubstrings(cs,3,5);
  * </pre>
  *
  * </blockquote>
  *
  * will increment the substrings of <code>&quot;defg&quot;</code> and then decrement the
  * substrings of <code>&quot;de&quot;</code>, causing the net effect of incrementing the counts of
  * substrings <code>&quot;defg&quot;</code>, <code>&quot;efg&quot;</code>, <code>&quot;fg&quot;
  * </code>, <code>&quot;g&quot;</code>, <code>&quot;def&quot;</code>, <code>&quot;ef&quot;</code>,
  * and <code>&quot;f&quot;</code>. This has the effect of increasing the estimate of <code>g
  * </code> given <code>def</code>, without increasing the estimate of <code>d</code> in an empty
  * context.
  *
  * @param cs Underlying array of characters in slice.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @throws IllegalArgumentException If the array slice is invalid.
  */
 public void decrementSubstrings(char[] cs, int start, int end) {
   Strings.checkArgsStartEnd(cs, start, end);
   // NOTE(review): unlike incrementSubstrings, this visits substrings of
   // every length (j is unbounded by mMaxLength) and includes the empty
   // substring (j starts at i) — presumably decrement is a no-op or safe
   // for both cases; confirm against Node.decrement.
   for (int i = start; i < end; ++i)
     for (int j = i; j <= end; ++j) mRootNode = mRootNode.decrement(cs, i, j);
 }
 /**
  * Increments the count of all prefixes of the specified character sequence up to the maximum
  * length specified in the constructor.
  *
  * @param cs Underlying character array.
  * @param start Index of first character in slice.
  * @param end Index of one past last character in slice.
  * @param count Amount to increment.
  * @throws IndexOutOfBoundsException If the specified start and one plus end point are not in the
  *     bounds of character sequence.
  */
 public void incrementPrefixes(char[] cs, int start, int end, int count) {
   Strings.checkArgsStartEnd(cs, start, end);
   // The trie root may be replaced by the increment (immutable-node
   // style), so the returned node is stored back.
   mRootNode = mRootNode.increment(cs, start, end, count);
 }
  /**
   * Parses the specified sentence, first by trying a parser socket server on
   * localhost and, if that fails, by falling back to a local parser object
   * (loading it on first use).  As side effects, {@code lastParse} and
   * {@code lastParseScore} are updated.
   *
   * @param sentence Sentence to parse.
   * @return Parse result with success flag, parse tree, and score.
   */
  public ParseResult parseSentence(String sentence) {
    // see if a parser socket server is available
    int port = Integer.parseInt(ARKref.getProperties().getProperty("parserServerPort", "5556"));
    String host = "127.0.0.1";
    // try-with-resources closes socket and streams on all paths (the
    // original leaked them when an exception was thrown mid-conversation).
    try (Socket client = new Socket(host, port);
        PrintWriter pw = new PrintWriter(client.getOutputStream());
        BufferedReader br = new BufferedReader(new InputStreamReader(client.getInputStream()))) {
      pw.println(sentence);
      pw.flush(); // flush to complete the transmission

      // Server protocol (as implemented here): every line with more input
      // pending is part of the parse; the final line is the parse score.
      StringBuilder resultBuf = new StringBuilder();
      String line;
      while ((line = br.readLine()) != null) {
        if (br.ready()) {
          // readLine strips the newline; collapse remaining whitespace.
          resultBuf.append(line.replaceAll("\\s+", " ")).append(' ');
        } else {
          lastParseScore = Double.parseDouble(line);
        }
      }

      String result = resultBuf.toString();
      System.err.println("parser output:" + result);

      lastParse = readTreeFromString(result);
      boolean success = !Strings.normalizeWhitespace(result).equals("(ROOT (. .))");
      return new ParseResult(success, lastParse, lastParseScore);
    } catch (Exception ex) {
      // Server unavailable or protocol error: fall through to local parser.
    }

    // if socket server not available, then use a local parser object
    if (parser == null) {
      if (DEBUG) System.err.println("Could not connect to parser server.  Loading parser...");
      try {
        Options op = new Options();
        String serializedInputFileOrUrl =
            ClassLoader.getSystemResource(
                    ARKref.getProperties()
                        .getProperty("parserGrammarFile", "lib/englishPCFG.ser.gz"))
                .toExternalForm();
        parser = LexicalizedParser.loadModel(serializedInputFileOrUrl, op);
        parser.setOptionFlags("-outputFormat", "oneline");
      } catch (Exception e) {
        e.printStackTrace();
      }
    }

    try {
      DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(sentence));

      LexicalizedParserQuery query = parser.parserQuery();

      if (query.parse(dp.iterator().next())) {
        lastParse = query.getBestParse();
        lastParseScore = query.getPCFGScore();
        // Round-trip the best parse through a Penn-Treebank printer so the
        // stored tree matches the server-produced format.
        TreePrint tp = new TreePrint("penn", "", new PennTreebankLanguagePack());
        StringWriter sw = new StringWriter();
        PrintWriter treeWriter = new PrintWriter(sw);
        tp.printTree(lastParse, treeWriter);
        treeWriter.flush();
        lastParse = readTreeFromString(sw.getBuffer().toString());

        return new ParseResult(true, lastParse, lastParseScore);
      }
    } catch (Exception e) {
      // Local parse failed: fall through to the failure result below.
    }

    // Neither server nor local parser produced a tree.
    lastParse = readTreeFromString("(ROOT (. .))");
    lastParseScore = -99999.0;
    return new ParseResult(false, lastParse, lastParseScore);
  }
// 예제 #12 (Example #12)
 // next two method cut-and-pasted from TokenizedLM
 /**
  * Returns the log (base 2) estimate of the specified character sequence
  * by delegating to the character-slice overload over a copy of its
  * characters.
  *
  * @param cSeq Character sequence to estimate.
  * @return Log (base 2) estimate of the sequence.
  */
 public double log2Estimate(CharSequence cSeq) {
   char[] cs = Strings.toCharArray(cSeq);
   return log2Estimate(cs, 0, cs.length);
 }