/**
 * Replaces the carrier's data with a {@code TokenSequence} built from {@code addRegexes(text)}
 * and records the text as the carrier's source.
 *
 * <p>When the data is a {@link File}, its content is read with a {@code LineReader} using
 * {@code "\n"} as the line separator, and the source becomes the text followed by
 * {@code " [file:<file name>]"}. When the data is a {@link String}, it is used as the text
 * directly and also stored unchanged as the source.
 *
 * @param carrier instance whose data is a {@code File} or a {@code String}
 * @return the same instance, with data and source replaced
 * @throws IllegalArgumentException if the data is neither a {@code File} nor a {@code String},
 *     or if reading the file fails (the I/O failure is attached as the cause)
 */
public Instance pipe(Instance carrier) {
    Object data = carrier.getData();

    if (data instanceof File) {
      File file = (File) data;
      String txt;
      // try-with-resources closes the stream as soon as the text has been read.
      try (FileInputStream in = new FileInputStream(file)) {
        txt = new LineReader(in).getText("\n");
      } catch (java.io.IOException e) {
        // Keep the original failure as the cause so the stack trace is not lost.
        throw new IllegalArgumentException("IOException " + e, e);
      }
      // update instance values
      carrier.setData(new TokenSequence(addRegexes(txt)));
      carrier.setSource(txt + " [file:" + file.getName() + "]");

    } else if (data instanceof String) {
      String txt = (String) data;
      // update instance values
      carrier.setData(new TokenSequence(addRegexes(txt)));
      carrier.setSource(txt);

    } else {
      throw new IllegalArgumentException("must be file or string " + data);
    }
    return carrier;
  }
  /**
   * Turns an {@code Arg1RankInstance} into a sequence of feature vectors, one per candidate.
   *
   * <p>For every candidate pair the baseline, constituent and dependency feature extractors are
   * applied in that order, each extending the property list produced by the previous one. The
   * target is the label looked up for the string form of {@code getTrueArg1Candidate()}.
   *
   * @param carrier an {@code Arg1RankInstance} whose data is a {@code Document}
   * @return the same instance, with target and data replaced
   */
  @Override
  public Instance pipe(Instance carrier) {
    Arg1RankInstance rankInstance = (Arg1RankInstance) carrier;

    Document doc = (Document) rankInstance.getData();
    List<Pair<Integer, Integer>> candidateList = rankInstance.getCandidates();
    int connectiveStart = rankInstance.getConnStart();
    int connectiveEnd = rankInstance.getConnEnd();
    int arg2LineNo = rankInstance.getArg2Line();
    int arg2Head = rankInstance.getArg2HeadPos();

    FeatureVector[] vectors = new FeatureVector[candidateList.size()];

    int slot = 0;
    while (slot < candidateList.size()) {
      Pair<Integer, Integer> current = candidateList.get(slot);

      // Each extractor receives the list built so far and returns the extended list.
      PropertyList props = null;
      props =
          addBaselineFeatures(
              props, doc, current, arg2LineNo, arg2Head, connectiveStart, connectiveEnd);
      props =
          addConstituentFeatures(
              props, doc, current, arg2LineNo, arg2Head, connectiveStart, connectiveEnd);
      props =
          addDependencyFeatures(
              props, doc, current, arg2LineNo, arg2Head, connectiveStart, connectiveEnd);
      // Lexico-syntactic features are currently disabled:
      // props = addLexicoSyntacticFeatures(props, doc, current, arg2LineNo, arg2Head,
      //     connectiveStart, connectiveEnd);

      vectors[slot] = new FeatureVector(getDataAlphabet(), props, true, true);
      slot++;
    }

    // The target label is set before the data is swapped for the feature vectors.
    LabelAlphabet labelDict = (LabelAlphabet) getTargetAlphabet();
    String trueCandidate = String.valueOf(rankInstance.getTrueArg1Candidate());
    carrier.setTarget(labelDict.lookupLabel(trueCandidate));
    carrier.setData(new FeatureVectorSequence(vectors));

    return carrier;
  }
  /**
   * Normalises the carrier's data by delegating to the matching {@code pipe} overload.
   *
   * <p>{@link URI}, {@link File} and {@link Reader} data are replaced with the result of the
   * corresponding overload. {@link CharSequence} data is left untouched.
   *
   * @param carrier instance whose data is a URI, File, Reader or CharSequence
   * @return the same instance, possibly with its data replaced
   * @throws IllegalArgumentException if the data is null or of an unsupported type, or if the
   *     conversion fails with an I/O error (attached as the cause)
   */
  public Instance pipe(Instance carrier) {
    Object data = carrier.getData();
    try {
      if (data instanceof URI) {
        carrier.setData(pipe((URI) data));
      } else if (data instanceof File) {
        carrier.setData(pipe((File) data));
      } else if (data instanceof Reader) {
        carrier.setData(pipe((Reader) data));
      } else if (!(data instanceof CharSequence)) {
        // Null data lands here too; report it instead of failing on getClass().
        throw new IllegalArgumentException(
            "Does not handle class " + (data == null ? "null" : data.getClass().toString()));
      }
      // CharSequence data needs no conversion.
    } catch (java.io.IOException e) {
      // Keep the original failure as the cause so the stack trace is not lost.
      throw new IllegalArgumentException("IOException " + e, e);
    }

    return carrier;
  }
  /**
   * Replaces String data with the result of {@code Jsoup.parse(data).text()}, i.e. the text
   * extracted from the data parsed as HTML.
   *
   * @param carrier instance whose data is a {@code String}
   * @return the same instance, with its data replaced by the extracted text
   * @throws IllegalArgumentException if the data is null or not a {@code String}
   */
  public Instance pipe(Instance carrier) {
    Object data = carrier.getData();

    if (!(data instanceof String)) {
      // Report this pipe's own class name, and tolerate null data while building the message.
      throw new IllegalArgumentException(
          getClass().getSimpleName()
              + " expects a String, found a "
              + (data == null ? "null" : data.getClass().toString()));
    }

    String cleanedText = Jsoup.parse((String) data).text();
    carrier.setData(cleanedText);

    return carrier;
  }
  /**
   * Normalises the carrier's data by delegating to the matching {@code pipe} overload and,
   * when {@code labelsInText} is set, peels a leading {@code [labels]} prefix off the text.
   *
   * <p>{@link URI}, {@link File} and {@link Reader} data are replaced with the result of the
   * corresponding overload. {@link CharSequence} data is left untouched.
   *
   * <p>With {@code labelsInText} enabled and text that starts with {@code '['}, everything
   * between that bracket and the first {@code ']'} becomes the target (trimmed, as one raw
   * string) and everything after the {@code ']'} becomes the data (trimmed). Text that does
   * not start with {@code '['} is left unchanged.
   *
   * @param carrier instance whose data is a URI, File, Reader or CharSequence
   * @return the same instance, possibly with data and target replaced
   * @throws IllegalArgumentException if the data is null or of an unsupported type, if the
   *     conversion fails with an I/O error (attached as the cause), or if a label prefix is
   *     opened with {@code '['} but never closed
   */
  public Instance pipe(Instance carrier) {
    Object data = carrier.getData();
    try {
      if (data instanceof URI) {
        carrier.setData(pipe((URI) data));
      } else if (data instanceof File) {
        carrier.setData(pipe((File) data));
      } else if (data instanceof Reader) {
        carrier.setData(pipe((Reader) data));
      } else if (!(data instanceof CharSequence)) {
        // Null data lands here too; report it instead of failing on getClass().
        throw new IllegalArgumentException(
            "Does not handle class " + (data == null ? "null" : data.getClass().toString()));
      }
      // CharSequence data needs no conversion.

      if (this.labelsInText) {
        String str = ((CharSequence) carrier.getData()).toString();

        if (str.startsWith("[")) {
          // First ']' ends the label section; later brackets belong to the text.
          int close = str.indexOf(']');
          if (close < 0) {
            throw new IllegalArgumentException(
                "Label prefix starts with '[' but has no closing ']': " + str);
          }

          carrier.setData(str.substring(close + 1).trim());

          // TODO: the label section is stored as one raw string; it is not split into
          // individual labels.
          carrier.setTarget(str.substring(1, close).trim());
        }
      }

    } catch (java.io.IOException e) {
      // Keep the original failure as the cause so the stack trace is not lost.
      throw new IllegalArgumentException("IOException " + e, e);
    }

    return carrier;
  }
 /**
  * Filters stoplisted tokens out of the carrier's {@code TokenSequence}.
  *
  * <p>A token is dropped when its text (lower-cased unless {@code caseSensitive} is set) is in
  * {@code stoplist}. Surviving tokens are added to a new sequence as the same objects, not
  * copies. When {@code markDeletions} is set, the text of a dropped token is stored under
  * {@code FeatureSequenceWithBigrams.deletionMark} on the most recently kept token, if any.
  *
  * @param carrier instance whose data is a {@code TokenSequence}
  * @return the same instance, with its data replaced by the filtered sequence
  */
 public Instance pipe(Instance carrier) {
   TokenSequence input = (TokenSequence) carrier.getData();
   TokenSequence kept = new TokenSequence();
   Token lastKept = null;

   for (int idx = 0; idx < input.size(); idx++) {
     Token token = input.get(idx);

     String lookupKey;
     if (caseSensitive) {
       lookupKey = token.getText();
     } else {
       lookupKey = token.getText().toLowerCase();
     }

     if (stoplist.contains(lookupKey)) {
       // Dropped token: optionally leave a trace on the previous survivor.
       if (markDeletions && lastKept != null) {
         lastKept.setProperty(FeatureSequenceWithBigrams.deletionMark, token.getText());
       }
       continue;
     }

     kept.add(token);
     lastKept = token;
   }

   carrier.setData(kept);
   return carrier;
 }
// Example #7
// 0
 /**
  * Converts tokenised sentence data into a {@code FeatureVectorSequence} and a matching
  * {@code LabelSequence} target.
  *
  * <p>The data is either a {@code String}, which is run through {@code parseSentence}, or a
  * ready-made {@code String[][]} with one row per line. When target processing is on, the last
  * entry of each row is added to the target as the label and the remaining entries are
  * features; otherwise every entry is a feature and the target is an empty
  * {@code LabelSequence} over the target alphabet.
  *
  * @param carrier instance whose data is a {@code String} or {@code String[][]}
  * @return the same instance, with data and target replaced
  * @throws IllegalArgumentException if the data is neither a {@code String} nor a
  *     {@code String[][]}
  * @throws IllegalStateException if target processing is on and a row is empty
  */
 public Instance pipe(Instance carrier) {
   Object rawInput = carrier.getData();
   Alphabet featureDict = getDataAlphabet();

   String[][] rows;
   if (rawInput instanceof String) {
     rows = parseSentence((String) rawInput);
   } else if (rawInput instanceof String[][]) {
     rows = (String[][]) rawInput;
   } else {
     throw new IllegalArgumentException("Not a String or String[][]; got " + rawInput);
   }

   FeatureVector[] vectors = new FeatureVector[rows.length];

   LabelSequence labelSeq = null;
   if (isTargetProcessing()) {
     LabelAlphabet labelDict = (LabelAlphabet) getTargetAlphabet();
     labelSeq = new LabelSequence(labelDict, rows.length);
   }

   for (int row = 0; row < rows.length; row++) {
     String[] fields = rows[row];

     int featureCount;
     if (isTargetProcessing()) {
       if (fields.length < 1) {
         throw new IllegalStateException(
             "Missing label at line " + row + " instance " + carrier.getName());
       }
       // The trailing field is the label; everything before it is a feature.
       featureCount = fields.length - 1;
       labelSeq.add(fields[featureCount]);
     } else {
       featureCount = fields.length;
     }

     int[] indices = new int[featureCount];
     for (int col = 0; col < featureCount; col++) {
       indices[col] = featureDict.lookupIndex(fields[col]);
     }

     if (featureInductionOption.value) {
       vectors[row] = new AugmentableFeatureVector(featureDict, indices, null, indices.length);
     } else {
       vectors[row] = new FeatureVector(featureDict, indices);
     }
   }

   carrier.setData(new FeatureVectorSequence(vectors));
   if (isTargetProcessing()) {
     carrier.setTarget(labelSeq);
   } else {
     carrier.setTarget(new LabelSequence(getTargetAlphabet()));
   }
   return carrier;
 }
 /**
  * Replaces the carrier's {@link File} data with a {@code FileIterator} built from that file
  * together with this pipe's {@code fileFilter} and {@code labelPattern}.
  *
  * @param carrier instance whose data is a {@code File}
  * @return the same instance, with its data replaced by the iterator
  */
 public Instance pipe(Instance carrier) {
   carrier.setData(new FileIterator((File) carrier.getData(), fileFilter, labelPattern));
   return carrier;
 }
  /**
   * Parses line-oriented text into a {@code StringTokenization} (data) and a
   * {@code LabelsSequence} (target), producing one span and one {@code Labels} per input line.
   *
   * <p>Each line is split on whitespace and consumed left to right with a single cursor:
   *
   * <ul>
   *   <li>unless {@code labelsAtEnd} is set, leading fields are read as labels until
   *       {@code isLabelSeparator} reports a separator, and a following {@code "----"} field is
   *       skipped;
   *   <li>if {@code featuresIncludeToken} is set and a field remains, it becomes the span text
   *       (otherwise the placeholder {@code "*???*"} is used);
   *   <li>the remaining fields up to the feature limit are set as features with value 1.0;
   *   <li>if {@code includeTokenText} is set, {@code textFeaturePrefix + text} is added as a
   *       feature;
   *   <li>if {@code labelsAtEnd} is set, the fields after the feature limit are read as labels.
   * </ul>
   *
   * @param carrier instance whose data is a {@code CharSequence}
   * @return the same instance, with data and target replaced
   * @throws ClassCastException if the data is not a {@code CharSequence}
   */
  public Instance pipe(Instance carrier) {
    Object rawData = carrier.getData();
    if (!(rawData instanceof CharSequence)) {
      throw new ClassCastException("Needed a String; got " + rawData);
    }
    String input = String.valueOf(rawData);

    String[] rows = input.split("\n");
    int rowCount = rows.length;

    StringSpan[] spans = new StringSpan[rowCount];
    Labels[] labelsPerRow = new Labels[rowCount];
    StringBuffer textBuffer = new StringBuffer();

    // NOTE(review): the alphabet is fetched but not used below.
    Alphabet dict = getDataAlphabet();

    for (int row = 0; row < rowCount; row++) {
      String[] fields = rows[row].split("\\s+");

      // Single cursor shared by the label, token-text and feature phases below.
      int cursor = 0;
      ArrayList rowLabels = new ArrayList();

      if (!labelsAtEnd) {
        for (; !isLabelSeparator(fields, cursor); cursor++) {
          rowLabels.add(labelForTok(fields[cursor], cursor));
        }
        if (cursor < fields.length && fields[cursor].equals("----")) {
          cursor++;
        }
        labelsPerRow[row] =
            new Labels((Label[]) rowLabels.toArray(new Label[rowLabels.size()]));
      }

      // With trailing labels, the last numLabels fields are reserved for them.
      int featureLimit;
      if (labelsAtEnd) {
        featureLimit = fields.length - numLabels;
      } else {
        featureLimit = fields.length;
      }

      String tokenText = "*???*";
      if (featuresIncludeToken && cursor < featureLimit) {
        tokenText = fields[cursor];
        cursor++;
      }

      // The span covers the token text only; the trailing space separates spans in the buffer.
      int spanStart = textBuffer.length();
      textBuffer.append(tokenText);
      int spanEnd = textBuffer.length();
      textBuffer.append(" ");

      StringSpan current = new StringSpan(textBuffer, spanStart, spanEnd);

      for (; cursor < featureLimit; cursor++) {
        current.setFeatureValue(fields[cursor].intern(), 1.0);
      }

      if (includeTokenText) {
        current.setFeatureValue((textFeaturePrefix + tokenText).intern(), 1.0);
      }

      if (labelsAtEnd) {
        int firstLabelPos = cursor;
        for (; cursor < fields.length; cursor++) {
          rowLabels.add(labelForTok(fields[cursor], cursor - firstLabelPos));
        }
        labelsPerRow[row] =
            new Labels((Label[]) rowLabels.toArray(new Label[rowLabels.size()]));
      }

      spans[row] = current;
    }

    StringTokenization result = new StringTokenization(textBuffer);
    result.addAll(spans);
    carrier.setData(result);

    carrier.setTarget(new LabelsSequence(labelsPerRow));

    return carrier;
  }