/**
 * Process each token, adding a digit-count feature if the token contains digits.
 *
 * @param carrier Instance to be processed.
 * @return Instance with new features.
 */
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    char[] text = t.getText().toCharArray();
    int numDigit = 0;
    for (int k = 0; k < text.length; k++) {
      if (Character.isDigit(text[k])) {
        numDigit++;
      }
    }
    if (numDigit == 1) {
      t.setFeatureValue("SingleDigit", 1.0);
    } else if (numDigit == 2) {
      t.setFeatureValue("TwoDigit", 1.0);
    } else if (numDigit == 3) {
      t.setFeatureValue("ThreeDigit", 1.0);
    } else if (numDigit >= 4) {
      t.setFeatureValue("MoreDigit", 1.0);
    }
  }
  return carrier;
}
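/*
 * A minimal usage sketch for the digit-count pipe above, assuming MALLET's
 * Token/TokenSequence/Instance API. The enclosing class name
 * "TokenTextDigitFeatures" is hypothetical; the snippet shows only the pipe()
 * method, not the class it belongs to.
 */
TokenSequence ts = new TokenSequence();
ts.add(new Token("mallet")); // no digits -> no feature added
ts.add(new Token("B12"));    // two digits -> "TwoDigit"
ts.add(new Token("1984"));   // four digits -> "MoreDigit"
Instance inst = new Instance(ts, null, "digit-demo", null);
new TokenTextDigitFeatures().pipe(inst); // hypothetical enclosing class
System.out.println(ts.get(1).getFeatureValue("TwoDigit"));  // expected: 1.0
System.out.println(ts.get(2).getFeatureValue("MoreDigit")); // expected: 1.0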
/**
 * Generate a token sequence from the text covered by the given annotation.
 *
 * @param annotation an annotation representing a document segment, e.g. {@link Sentence}.
 * @param tokenType the type to use for representing tokens, usually {@link Token}, but could also
 *     be any other type.
 * @return a {@link TokenSequence} holding the tokens covered by the annotation
 */
private TokenSequence generateTokenSequence(AnnotationFS annotation, Type tokenType) {
  TokenSequence tokenSequence = new TokenSequence();
  for (AnnotationFS token : CasUtil.selectCovered(tokenType, annotation)) {
    for (String tokenText : getTokensFromAnnotation(token, useLemma, minTokenLength)) {
      tokenSequence.add(tokenText);
    }
  }
  return tokenSequence;
}
/**
 * Generate a TokenSequence from the whole document.
 *
 * @param aJCas a CAS holding the document
 * @param tokenType this type will be used as token, e.g. Token, N-gram etc.
 * @param useLemma if this is true, use lemmas
 * @param minTokenLength the minimum token length to use
 * @return a {@link TokenSequence}
 * @throws FeaturePathException if the annotation type specified in {@code PARAM_TYPE_NAME} cannot
 *     be extracted.
 */
protected static TokenSequence generateTokenSequence(
    JCas aJCas, Type tokenType, boolean useLemma, int minTokenLength) throws FeaturePathException {
  TokenSequence tokenSequence = new TokenSequence();
  for (AnnotationFS token : CasUtil.select(aJCas.getCas(), tokenType)) {
    for (String tokenText : getTokensFromAnnotation(token, useLemma, minTokenLength)) {
      tokenSequence.add(tokenText);
    }
  }
  return tokenSequence;
}
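/*
 * Sketch of how the two generators above might be driven, assuming uimaFIT's
 * CasUtil/JCasUtil and DKPro Core's Token and Sentence types. The call to the
 * private per-annotation overload is only meaningful from inside the enclosing
 * class, whose useLemma and minTokenLength fields it reads; the useLemma and
 * minTokenLength arguments below (false, 3) are arbitrary.
 */
Type tokenType = CasUtil.getType(aJCas.getCas(), Token.class);

// Document level: one TokenSequence covering the whole CAS.
TokenSequence documentSequence = generateTokenSequence(aJCas, tokenType, false, 3);

// Segment level: one TokenSequence per sentence.
for (Sentence sentence : JCasUtil.select(aJCas, Sentence.class)) {
  TokenSequence sentenceSequence = generateTokenSequence(sentence, tokenType);
}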
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    String s = t.getText();
    if (distinguishBorders) {
      s = startBorderChar + s + endBorderChar;
    }
    int slen = s.length();
    for (int j = 0; j < gramSizes.length; j++) {
      int size = gramSizes[j];
      // Slide a window of width "size" over the token text, one feature per character n-gram.
      for (int k = 0; k < (slen - size) + 1; k++) {
        t.setFeatureValue((prefix + s.substring(k, k + size)).intern(), 1.0);
      }
    }
  }
  return carrier;
}
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    String s = t.getText();
    if (distinguishBorders) {
      s = startBorderChar + s + endBorderChar;
    }
    int slen = s.length();
    for (int j = 0; j < gramSizes.length; j++) {
      int size = gramSizes[j];
      // Note: the bound "slen - size" skips the final n-gram; the variant above
      // uses "(slen - size) + 1" to include it.
      for (int k = 0; k < slen - size; k++) {
        // original was substring(k, size), changed by Fuchun
        t.setFeatureValue(s.substring(k, k + size), 1.0);
      }
    }
  }
  return carrier;
}
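/*
 * Worked example contrasting the two n-gram loops above, assuming
 * gramSizes = {2}, distinguishBorders = true, startBorderChar = '<' and
 * endBorderChar = '>'. For the token "dog" the padded string is "<dog>";
 * "CHARBIGRAM=" stands in for the prefix field.
 */
String s = "<dog>";
int size = 2;
// First variant, bound (slen - size) + 1: emits <d, do, og, g>
for (int k = 0; k < (s.length() - size) + 1; k++) {
  System.out.println("CHARBIGRAM=" + s.substring(k, k + size));
}
// Second variant, bound slen - size: stops one n-gram early, emitting <d, do, og
for (int k = 0; k < s.length() - size; k++) {
  System.out.println(s.substring(k, k + size));
}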
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  // xxx This doesn't seem so efficient. Perhaps have TokenSequence
  // use a LinkedList, and remove Tokens from it? -?
  // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM
  TokenSequence ret = new TokenSequence();
  Token prevToken = null;
  for (int i = 0; i < ts.size(); i++) {
    Token t = ts.get(i);
    if (!stoplist.contains(caseSensitive ? t.getText() : t.getText().toLowerCase())) {
      // xxx Should we instead make and add a copy of the Token?
      ret.add(t);
      prevToken = t;
    } else if (markDeletions && prevToken != null) {
      prevToken.setProperty(FeatureSequenceWithBigrams.deletionMark, t.getText());
    }
  }
  carrier.setData(ret);
  return carrier;
}
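/*
 * Usage sketch for the stopword-removal pipe above; in MALLET the enclosing
 * class is TokenSequenceRemoveStopwords, whose stoplist, caseSensitive and
 * markDeletions fields configure the behavior. The call below assumes it runs
 * inside that class with "the" in the stoplist and caseSensitive == false.
 */
TokenSequence ts = new TokenSequence();
ts.add(new Token("quick"));
ts.add(new Token("the"));
ts.add(new Token("fox"));
Instance inst = new Instance(ts, null, "stopword-demo", null);
pipe(inst);
TokenSequence filtered = (TokenSequence) inst.getData();
// filtered now holds only "quick" and "fox"; with markDeletions == true,
// "quick" is tagged with the deleted text "the" via
// FeatureSequenceWithBigrams.deletionMark.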
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  TokenSequence targets =
      carrier.getTarget() instanceof TokenSequence ? (TokenSequence) carrier.getTarget() : null;
  TokenSequence source =
      carrier.getSource() instanceof TokenSequence ? (TokenSequence) carrier.getSource() : null;
  StringBuffer sb = new StringBuffer();
  if (prefix != null) {
    sb.append(prefix);
  }
  sb.append("name: " + carrier.getName() + "\n");
  for (int i = 0; i < ts.size(); i++) {
    if (source != null) {
      sb.append(source.get(i).getText());
      sb.append(' ');
    }
    if (targets != null) {
      sb.append(targets.get(i).getText());
      sb.append(' ');
    }
    if (carrier.getTarget() instanceof FeatureSequence) {
      sb.append(((FeatureSequence) carrier.getTarget()).getObjectAtPosition(i).toString());
      sb.append(' ');
    }
    PropertyList pl = ts.get(i).getFeatures();
    if (pl != null) {
      PropertyList.Iterator iter = pl.iterator();
      while (iter.hasNext()) {
        iter.next();
        double v = iter.getNumericValue();
        if (v == 1.0) {
          sb.append(iter.getKey());
        } else {
          sb.append(iter.getKey() + '=' + v);
        }
        sb.append(' ');
      }
    }
    sb.append('\n');
  }
  System.out.print(sb.toString());
  return carrier;
}
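/*
 * The printer above emits one line per token: the source token (if any), the
 * target token or label (if any), then each feature, printed as its key alone
 * when the value is 1.0 and as key=value otherwise. Illustrative output for a
 * two-token instance with TokenSequence targets (all names invented for the
 * sketch, not taken from a real run):
 *
 *   name: doc1
 *   O CAPITALIZED W=Mallet
 *   O W=rocks COUNT=2.0
 */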
public Instance pipe(Instance carrier) {
  TokenSequence ts = (TokenSequence) carrier.getData();
  int tsSize = ts.size();
  PropertyList[] oldfs = new PropertyList[ts.size()];
  PropertyList[] newfs = new PropertyList[ts.size()];
  for (int i = 0; i < tsSize; i++) {
    oldfs[i] = ts.get(i).getFeatures();
  }
  if (includeOriginalSingletons) {
    for (int i = 0; i < tsSize; i++) {
      newfs[i] = ts.get(i).getFeatures();
    }
  }
  for (int i = 0; i < ts.size(); i++) {
    conjunctionList:
    for (int j = 0; j < conjunctions.length; j++) {
      // Make sure that the offsets in the conjunction are all available at this position
      for (int k = 0; k < conjunctions[j].length; k++) {
        if (conjunctions[j][k] + i < 0
            || conjunctions[j][k] + i > tsSize - 1
            || oldfs[i + conjunctions[j][k]] == null) {
          continue conjunctionList;
        }
      }
      // Add the features for this conjunction
      if (conjunctions[j].length == 1) {
        int offset = conjunctions[j][0];
        if (offset == 0 && includeOriginalSingletons) {
          throw new IllegalArgumentException("Original singletons already there.");
        }
        PropertyList.Iterator iter = oldfs[i + offset].iterator();
        while (iter.hasNext()) {
          iter.next();
          if (propertyKey != null && !propertyKey.equals(iter.getKey())) {
            continue;
          }
          String key = iter.getKey() + (offset == 0 ? "" : "@" + offset);
          newfs[i] = PropertyList.add(key, iter.getNumericValue(), newfs[i]);
        }
      } else if (conjunctions[j].length == 2) {
        int offset0 = conjunctions[j][0];
        int offset1 = conjunctions[j][1];
        PropertyList.Iterator iter0 = oldfs[i + offset0].iterator();
        int iter0i = -1;
        while (iter0.hasNext()) {
          iter0i++;
          iter0.next();
          if (propertyKey != null && !propertyKey.equals(iter0.getKey())) {
            continue;
          }
          PropertyList.Iterator iter1 = oldfs[i + offset1].iterator();
          int iter1i = -1;
          while (iter1.hasNext()) {
            iter1i++;
            iter1.next();
            if (propertyKey != null && !propertyKey.equals(iter1.getKey())) {
              continue;
            }
            // Avoid redundant doubling of feature space; include only upper triangle
            if (offset0 == offset1 && iter1i <= iter0i) {
              continue;
            }
            String key =
                iter0.getKey()
                    + (offset0 == 0 ? "" : "@" + offset0)
                    + "&"
                    + iter1.getKey()
                    + (offset1 == 0 ? "" : "@" + offset1);
            newfs[i] =
                PropertyList.add(key, iter0.getNumericValue() * iter1.getNumericValue(), newfs[i]);
          }
        }
      } else if (conjunctions[j].length == 3) {
        int offset0 = conjunctions[j][0];
        int offset1 = conjunctions[j][1];
        int offset2 = conjunctions[j][2];
        PropertyList.Iterator iter0 = oldfs[i + offset0].iterator();
        int iter0i = -1;
        while (iter0.hasNext()) {
          iter0i++;
          iter0.next();
          if (propertyKey != null && !propertyKey.equals(iter0.getKey())) {
            continue;
          }
          PropertyList.Iterator iter1 = oldfs[i + offset1].iterator();
          int iter1i = -1;
          while (iter1.hasNext()) {
            iter1i++;
            iter1.next();
            if (propertyKey != null && !propertyKey.equals(iter1.getKey())) {
              continue;
            }
            // Avoid redundant doubling of feature space; include only upper triangle
            if (offset0 == offset1 && iter1i <= iter0i) {
              continue;
            }
            PropertyList.Iterator iter2 = oldfs[i + offset2].iterator();
            int iter2i = -1;
            while (iter2.hasNext()) {
              iter2i++;
              iter2.next();
              if (propertyKey != null && !propertyKey.equals(iter2.getKey())) {
                continue;
              }
              // Avoid redundant doubling of feature space; include only upper triangle
              if (offset1 == offset2 && iter2i <= iter1i) {
                continue;
              }
              String key =
                  iter0.getKey()
                      + (offset0 == 0 ? "" : "@" + offset0)
                      + "&"
                      + iter1.getKey()
                      + (offset1 == 0 ? "" : "@" + offset1)
                      + "&"
                      + iter2.getKey()
                      + (offset2 == 0 ? "" : "@" + offset2);
              newfs[i] =
                  PropertyList.add(
                      key,
                      iter0.getNumericValue() * iter1.getNumericValue() * iter2.getNumericValue(),
                      newfs[i]);
            }
          }
        }
      } else {
        throw new UnsupportedOperationException(
            "Conjunctions of length 4 or more not yet implemented.");
      }
    }
  }
  // Put the new PropertyLists in place
  for (int i = 0; i < ts.size(); i++) {
    ts.get(i).setFeatures(newfs[i]);
  }
  return carrier;
}