private CharArraySet remove(CharArraySet fromSet, char[] charArray) { // System.out.println( "remove from: " + new String( charArray )); CharArraySet newSet = new CharArraySet(5, false); Iterator<Object> phraseIt = currentSetToCheck.iterator(); while (phraseIt != null && phraseIt.hasNext()) { char[] phrase = (char[]) phraseIt.next(); // if (!equals( phrase, charArray) && (startsWith( charArray, phrase ) || endsWith( charArray, // phrase))) { if (!equals(phrase, charArray) && startsWith(phrase, charArray) || endsWith(charArray, phrase)) { newSet.add(phrase); } else { // System.out.println( "removing " + new String( phrase )); } } return newSet; }
/**
 * Groups the phrases in {@code phraseSet} by the value {@code getFirstTerm} returns for each.
 *
 * <p>For every phrase, the first term is looked up in the map; if no set exists for it yet, a
 * new {@code CharArraySet} is created and registered under that term. The phrase is then added
 * to that set. Each phrase and its first term are written to the debug log, wrapped in single
 * quotes.
 *
 * @param phraseSet the phrases to index; must not be {@code null}
 * @return a map from first term to the set of phrases for which {@code getFirstTerm} returned
 *     that term
 */
private CharArrayMap convertPhraseSet(CharArraySet phraseSet) {
  // Parameterised construction instead of the raw type, so the assignment is checked.
  CharArrayMap<CharArraySet> phraseMap = new CharArrayMap<CharArraySet>(100, false);

  for (Object entry : phraseSet) {
    char[] phrase = (char[]) entry;
    Log.debug("'" + new String(phrase) + "'");

    char[] firstTerm = getFirstTerm(phrase);
    Log.debug("'" + new String(firstTerm) + "'");

    CharArraySet itsPhrases = phraseMap.get(firstTerm, 0, firstTerm.length);
    if (itsPhrases == null) {
      // First phrase seen for this term: create its bucket.
      itsPhrases = new CharArraySet(5, false);
      phraseMap.put(new String(firstTerm), itsPhrases);
    }
    itsPhrases.add(phrase);
  }

  return phraseMap;
}
/**
 * Produces the next output token, combining consecutive input tokens into a phrase when the
 * accumulated text matches an entry reachable through {@code phraseMap}.
 *
 * <p>Order of work on each call:
 * <ol>
 *   <li>When {@code emitSingleTokens} is false and {@code unusedTokens} is non-empty, the first
 *       queued token is emitted.
 *   <li>Otherwise, a pending {@code lastToken} is emitted and cleared.
 *   <li>Otherwise {@code nextToken()} is read. A {@code null} result triggers the flush logic
 *       (pending {@code lastValid}, remaining candidate phrases, put-back tokens); this is the
 *       only path that can return {@code false}.
 *   <li>A non-null token either starts a new candidate phrase, extends the current one, or is
 *       emitted as-is. While a candidate is still open the method calls itself to consume the
 *       next input token.
 * </ol>
 *
 * @return {@code true} on every path except the end-of-input path where nothing is left to
 *     flush, which returns {@code false}
 * @throws IOException declared by the overridden method
 */
@Override
public boolean incrementToken() throws IOException {
  if (!emitSingleTokens && unusedTokens.size() > 0) {
    Log.debug("emitting unused phrases");
    // emit these until the queue is empty before emitting any new stuff
    Token aToken = unusedTokens.remove(0);
    emit(aToken);
    return true;
  }

  // A token parked by a previous call goes out before any new input is read.
  if (lastToken != null) {
    emit(lastToken);
    lastToken = null;
    return true;
  }

  char[] nextToken = nextToken();
  // A null nextToken is treated as "no more input": flush whatever state is still pending.
  if (nextToken == null) {
    // A complete phrase was being held back while a longer candidate was still open.
    if (lastValid != null) {
      emit(lastValid);
      lastValid = null;
      return true;
    }

    if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      // Single-token mode: emit a remaining candidate phrase if the buffered text ends with it.
      char[] phrase = getFirst(currentSetToCheck);
      char[] lastTok = getCurrentBuffer(new char[0]);
      if (phrase != null && endsWith(lastTok, phrase)) {
        currentSetToCheck = remove(currentSetToCheck, phrase);
        emit(phrase);
        return true;
      }
    } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
      // Phrase-only mode: if the buffered text differs from what was last emitted, the open
      // candidate did not complete, so hand its tokens to unusedTokens and emit the first one.
      if (lastEmitted != null && !equals(fixWhitespace(lastEmitted), getCurrentBuffer(new char[0]))) {
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }

    // Nothing has been emitted yet but text is buffered.
    if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
      char[] lastTok = getCurrentBuffer(new char[0]);
      // NOTE(review): currentSetToCheck is dereferenced here without a null check, unlike the
      // two branches above - confirm it cannot be null while currentPhrase is non-empty.
      if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
        emit(lastTok);
        currentPhrase.setLength(0);
        return true;
      } else if (!emitSingleTokens) {
        discardCharTokens(currentPhrase, unusedTokens);
        currentSetToCheck = null;
        currentPhrase.setLength(0);
        if (unusedTokens.size() > 0) {
          Token aToken = unusedTokens.remove(0);
          Log.debug("emitting putback token");
          emit(aToken);
          return true;
        }
      }
    }
    return false;
  }

  // if emitSingleTokens, set lastToken = nextToken so it is emitted by a later call
  if (emitSingleTokens) {
    lastToken = nextToken;
  }

  if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
    // No candidate phrase is open: this token either starts one or is passed straight through.
    Log.debug("Checking for phrase start on '" + new String(nextToken) + "'");
    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // get the phrase set for this token and make it the current candidate set
      currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
      if (currentPhrase == null) currentPhrase = new StringBuffer();
      else currentPhrase.setLength(0);
      currentPhrase.append(nextToken);
      // Recurse to read the following input token.
      return incrementToken();
    } else {
      emit(nextToken);
      // clear lastToken so the same token is not emitted again on the next call
      lastToken = null;
      return true;
    }
  } else {
    // A candidate phrase is open: get the buffer contents with this token taken into account.
    char[] currentBuffer = getCurrentBuffer(nextToken);
    if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
      // The buffer is a complete phrase.
      // If it is the only valid one, emit it.
      // If there is a longer candidate, wait to see whether it will be matched;
      // if the longer one breaks on the next token, this one is emitted (via lastValid).
      currentSetToCheck = remove(currentSetToCheck, currentBuffer);
      if (currentSetToCheck.size() == 0) {
        emit(currentBuffer);
        lastValid = null;
        // NOTE(review): presumably adjusts the position increment for the merged phrase
        // token - confirm against emit().
        --positionIncr;
      } else {
        if (emitSingleTokens) {
          // NOTE(review): returns true without calling emit() in this call, and replaces the
          // lastToken assigned above with the phrase - confirm this is intended.
          lastToken = currentBuffer;
          return true;
        }
        lastValid = currentBuffer;
      }

      if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
        // this token also starts phrases of its own: restart the candidate set and buffer on it
        currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length);
        if (currentPhrase == null) currentPhrase = new StringBuffer();
        else currentPhrase.setLength(0);
        currentPhrase.append(nextToken);
      }
      // Keep reading only while a shorter match is being held back in lastValid.
      return (lastValid != null) ? incrementToken() : true;
    }

    if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
      // this token starts phrases of its own: add each of them to the current candidate set
      CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
      Iterator<Object> phraseIt = newSet.iterator();
      while (phraseIt != null && phraseIt.hasNext()) {
        char[] phrase = (char[]) phraseIt.next();
        currentSetToCheck.add(phrase);
      }
    }

    // for each phrase in currentSetToCheck -
    // if there is a phrase prefix match, get the next token recursively
    Iterator<Object> phraseIt = currentSetToCheck.iterator();
    while (phraseIt != null && phraseIt.hasNext()) {
      char[] phrase = (char[]) phraseIt.next();
      if (startsWith(phrase, currentBuffer)) {
        return incrementToken();
      }
    }

    // No candidate can be extended any further: fall back to the held-back match, if any.
    if (lastValid != null) {
      emit(lastValid);
      lastValid = null;
      return true;
    }

    if (!emitSingleTokens) {
      // current phrase didn't match fully: put the tokens back
      // into the unusedTokens list
      discardCharTokens(currentPhrase, unusedTokens);
      currentPhrase.setLength(0);
      currentSetToCheck = null;
      if (unusedTokens.size() > 0) {
        Token aToken = unusedTokens.remove(0);
        Log.debug("emitting putback token");
        emit(aToken);
        return true;
      }
    }

    // Drop the candidate set and start over with the next call.
    currentSetToCheck = null;
    Log.debug("returning at end.");
    return incrementToken();
  }
}
/**
 * Returns the first element the set's iterator yields.
 *
 * @param charSet the set to read from; must not be {@code null}
 * @return the first iterated element cast to {@code char[]}, or {@code null} when the set is
 *     empty
 */
private char[] getFirst(CharArraySet charSet) {
  if (!charSet.isEmpty()) {
    return (char[]) charSet.iterator().next();
  }
  return null;
}