public Instance pipe(Instance carrier) { if (carrier.getData() instanceof File) { try { // get file text File file = (File) carrier.getData(); @SuppressWarnings("resource") String txt = new LineReader(new FileInputStream(file)).getText("\n"); // update instance values carrier.setData(new TokenSequence(addRegexes(txt))); carrier.setSource(txt + " [file:" + file.getName() + "]"); } catch (java.io.IOException e) { throw new IllegalArgumentException("IOException " + e); } } else if (carrier.getData() instanceof String) { String txt = (String) carrier.getData(); // update instance values carrier.setData(new TokenSequence(addRegexes(txt))); carrier.setSource(txt); } else { throw new IllegalArgumentException("must be file or string " + carrier.getData()); } return carrier; }
@Override public Instance pipe(Instance carrier) { Arg1RankInstance instance = (Arg1RankInstance) carrier; Document document = (Document) instance.getData(); List<Pair<Integer, Integer>> candidates = instance.getCandidates(); int connStart = instance.getConnStart(); int connEnd = instance.getConnEnd(); int arg2Line = instance.getArg2Line(); int arg2HeadPos = instance.getArg2HeadPos(); FeatureVector fvs[] = new FeatureVector[candidates.size()]; for (int i = 0; i < candidates.size(); i++) { Pair<Integer, Integer> candidate = candidates.get(i); PropertyList pl = null; pl = addBaselineFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd); pl = addConstituentFeatures( pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd); pl = addDependencyFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, connEnd); // pl = addLexicoSyntacticFeatures(pl, document, candidate, arg2Line, arg2HeadPos, connStart, // connEnd); fvs[i] = new FeatureVector(getDataAlphabet(), pl, true, true); } // set target label LabelAlphabet ldict = (LabelAlphabet) getTargetAlphabet(); carrier.setTarget(ldict.lookupLabel(String.valueOf(instance.getTrueArg1Candidate()))); carrier.setData(new FeatureVectorSequence(fvs)); return carrier; }
public Instance pipe(Instance carrier) { try { if (carrier.getData() instanceof URI) carrier.setData(pipe((URI) carrier.getData())); else if (carrier.getData() instanceof File) carrier.setData(pipe((File) carrier.getData())); else if (carrier.getData() instanceof Reader) carrier.setData(pipe((Reader) carrier.getData())); else if (carrier.getData() instanceof CharSequence) ; // No conversion necessary else throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass()); } catch (java.io.IOException e) { throw new IllegalArgumentException("IOException " + e); } // System.out.println(carrier.getData().toString()); return carrier; }
/**
 * Replaces the carrier's String data with the result of {@code Jsoup.parse(data).text()}.
 *
 * @param carrier instance whose data must be a {@code String}
 * @return the same carrier, with its data replaced
 * @throws IllegalArgumentException if the data is not a {@code String} (including
 *     {@code null})
 */
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  if (!(data instanceof String)) {
    // Report this pipe's actual class name; the previous message named a different pipe.
    // Null is handled explicitly so that building the message cannot throw an NPE.
    throw new IllegalArgumentException(
        getClass().getSimpleName()
            + " expects a String, found a "
            + (data == null ? "null" : data.getClass()));
  }
  String cleanedText = Jsoup.parse((String) data).text();
  carrier.setData(cleanedText);
  return carrier;
}
public Instance pipe(Instance carrier) { try { if (carrier.getData() instanceof URI) { carrier.setData(pipe((URI) carrier.getData())); } else if (carrier.getData() instanceof File) { carrier.setData(pipe((File) carrier.getData())); } else if (carrier.getData() instanceof Reader) { carrier.setData(pipe((Reader) carrier.getData())); } else if (carrier.getData() instanceof CharSequence) ; // No conversion necessary else { throw new IllegalArgumentException("Does not handle class " + carrier.getData().getClass()); } if (this.labelsInText) { String str = ((CharSequence) carrier.getData()).toString(); if (str.startsWith("[")) { String[] labelsBoundary = str.substring(1) . // remove initial '[' split("]", 2); // separate labels and str between ']' // String[] labelStrs = labelsBoundary[0].trim().split("[ \\t]"); carrier.setData(labelsBoundary[1].trim()); carrier.setTarget(labelsBoundary[0].trim()); // homer to do } } } catch (java.io.IOException e) { throw new IllegalArgumentException("IOException " + e); } // System.out.println(carrier.getData().toString()); return carrier; }
public Instance pipe(Instance carrier) { TokenSequence ts = (TokenSequence) carrier.getData(); // xxx This doesn't seem so efficient. Perhaps have TokenSequence // use a LinkedList, and remove Tokens from it? -? // But a LinkedList implementation of TokenSequence would be quite inefficient -AKM TokenSequence ret = new TokenSequence(); Token prevToken = null; for (int i = 0; i < ts.size(); i++) { Token t = ts.get(i); if (!stoplist.contains(caseSensitive ? t.getText() : t.getText().toLowerCase())) { // xxx Should we instead make and add a copy of the Token? ret.add(t); prevToken = t; } else if (markDeletions && prevToken != null) prevToken.setProperty(FeatureSequenceWithBigrams.deletionMark, t.getText()); } carrier.setData(ret); return carrier; }
/**
 * Converts tokenised input into a {@code FeatureVectorSequence} and, when target processing
 * is enabled, a {@code LabelSequence}.
 *
 * <p>The data is either a {@code String}, which is run through {@code parseSentence}, or an
 * already-split {@code String[][]} with one row per position. With target processing enabled,
 * the last entry of each row is added to the target sequence and the remaining entries are
 * features. Otherwise every entry is a feature and the target is set to an empty
 * {@code LabelSequence} over the target alphabet.
 *
 * @param carrier instance whose data is a {@code String} or {@code String[][]}
 * @return the same carrier, with data and target replaced
 * @throws IllegalArgumentException if the data has any other type
 * @throws IllegalStateException if target processing is enabled and a row is empty
 */
public Instance pipe(Instance carrier) {
  Object raw = carrier.getData();
  Alphabet featureAlphabet = getDataAlphabet();

  String[][] rows;
  if (raw instanceof String) {
    rows = parseSentence((String) raw);
  } else if (raw instanceof String[][]) {
    rows = (String[][]) raw;
  } else {
    throw new IllegalArgumentException("Not a String or String[][]; got " + raw);
  }

  boolean withTargets = isTargetProcessing();
  FeatureVector[] vectors = new FeatureVector[rows.length];
  LabelSequence targetSeq = null;
  if (withTargets) {
    LabelAlphabet labelAlphabet = (LabelAlphabet) getTargetAlphabet();
    targetSeq = new LabelSequence(labelAlphabet, rows.length);
  }

  for (int row = 0; row < rows.length; row++) {
    String[] entries = rows[row];
    int featureCount = entries.length;
    if (withTargets) {
      if (entries.length < 1) {
        throw new IllegalStateException(
            "Missing label at line " + row + " instance " + carrier.getName());
      }
      // The trailing entry is the label; everything before it is a feature.
      featureCount = entries.length - 1;
      targetSeq.add(entries[featureCount]);
    }

    int[] indices = new int[featureCount];
    for (int col = 0; col < featureCount; col++) {
      indices[col] = featureAlphabet.lookupIndex(entries[col]);
    }

    if (featureInductionOption.value) {
      vectors[row] = new AugmentableFeatureVector(featureAlphabet, indices, null, indices.length);
    } else {
      vectors[row] = new FeatureVector(featureAlphabet, indices);
    }
  }

  carrier.setData(new FeatureVectorSequence(vectors));
  if (withTargets) {
    carrier.setTarget(targetSeq);
  } else {
    carrier.setTarget(new LabelSequence(getTargetAlphabet()));
  }
  return carrier;
}
/**
 * Replaces the carrier's {@link File} data with a {@code FileIterator} constructed from that
 * file together with this pipe's {@code fileFilter} and {@code labelPattern}.
 *
 * @param carrier instance whose data is a {@code File}
 * @return the same carrier, now holding the iterator as its data
 */
public Instance pipe(Instance carrier) {
  Object data = carrier.getData();
  FileIterator files = new FileIterator((File) data, fileFilter, labelPattern);
  carrier.setData(files);
  return carrier;
}
/**
 * Parses line-oriented text into a {@code StringTokenization} (one {@code StringSpan} per
 * input line) and a parallel {@code LabelsSequence}.
 *
 * <p>Each line is split on whitespace. Its labels are read from the front of the line, or from
 * the end when {@code labelsAtEnd} is set; in that case the last {@code numLabels} fields are
 * excluded from the feature range. When {@code featuresIncludeToken} is set, the first field
 * of the feature range supplies the span text; otherwise, or if that range is empty, the
 * placeholder {@code "*???*"} is used. The remaining fields of the feature range are set as
 * features with value 1.0. When {@code includeTokenText} is set, an extra feature named
 * {@code textFeaturePrefix + text} is added.
 *
 * @param carrier instance whose data is a {@code CharSequence}
 * @return the same carrier, with data and target replaced
 * @throws ClassCastException if the data is not a {@code CharSequence}
 */
public Instance pipe(Instance carrier) {
  Object payload = carrier.getData();
  if (!(payload instanceof CharSequence)) {
    throw new ClassCastException("Needed a String; got " + payload);
  }
  String input = payload.toString();

  String[] rows = input.split("\n");
  StringSpan[] spans = new StringSpan[rows.length];
  Labels[] rowLabels = new Labels[rows.length];
  // All span texts are appended to this one buffer, separated by single spaces.
  StringBuffer text = new StringBuffer();
  // NOTE(review): the alphabet is fetched but never used below - confirm the call is needed.
  Alphabet dict = getDataAlphabet();

  for (int row = 0; row < rows.length; row++) {
    String[] fields = rows[row].split("\\s+");
    ArrayList labels = new ArrayList();
    int pos = 0;

    if (!labelsAtEnd) {
      // Leading labels: consume fields until isLabelSeparator reports a separator.
      for (; !isLabelSeparator(fields, pos); pos++) {
        labels.add(labelForTok(fields[pos], pos));
      }
      // Skip an explicit "----" separator field if one is present.
      if (pos < fields.length && fields[pos].equals("----")) {
        pos++;
      }
      rowLabels[row] = new Labels((Label[]) labels.toArray(new Label[labels.size()]));
    }

    int featureLimit = labelsAtEnd ? fields.length - numLabels : fields.length;

    String tokenText = "*???*";
    if (featuresIncludeToken && pos < featureLimit) {
      tokenText = fields[pos];
      pos++;
    }

    // Record the span boundaries around the token text, then add the separating space.
    int spanStart = text.length();
    text.append(tokenText);
    int spanEnd = text.length();
    text.append(" ");
    StringSpan span = new StringSpan(text, spanStart, spanEnd);

    for (; pos < featureLimit; pos++) {
      span.setFeatureValue(fields[pos].intern(), 1.0);
    }
    if (includeTokenText) {
      span.setFeatureValue((textFeaturePrefix + tokenText).intern(), 1.0);
    }

    if (labelsAtEnd) {
      // Trailing labels: the level index counts from the first label field.
      for (int level = 0; pos < fields.length; pos++, level++) {
        labels.add(labelForTok(fields[pos], level));
      }
      rowLabels[row] = new Labels((Label[]) labels.toArray(new Label[labels.size()]));
    }

    spans[row] = span;
  }

  StringTokenization tokenization = new StringTokenization(text);
  tokenization.addAll(spans);
  carrier.setData(tokenization);
  carrier.setTarget(new LabelsSequence(rowLabels));
  return carrier;
}