@Test
  public void testFieldCoercion() throws IOException {
    // 75.185.76.245 - - [01/Sep/2007:00:01:03 +0000] "POST /mt-tb.cgi/235 HTTP/1.1" 403 174 "-"
    // "Opera/9.10 (Windows NT 5.1; U; ru)" "-"

    DateType dateType = new DateType(TestConstants.APACHE_DATE_FORMAT);

    Type[] types =
        new Type[] {
          String.class, // ip
          String.class, // client
          String.class, // user
          dateType, // date
          String.class, // request
          int.class, // code
          long.class, // bytes
          String.class, // referrer
          String.class, // agent
          String.class // na
        };

    Fields fields =
        new Fields(
            "ip", "client", "user", "date", "request", "code", "bytes", "referrer", "agent", "na");

    fields = fields.applyTypes(types);

    Tap input =
        getPlatform()
            .getDelimitedFile(
                fields, true, true, ",", "\"", null, inputFileApacheClean, SinkMode.KEEP);
    Tap output =
        getPlatform()
            .getDelimitedFile(
                fields,
                true,
                true,
                ",",
                "\"",
                null,
                getOutputPath(getTestName()),
                SinkMode.REPLACE);

    Pipe pipe = new Pipe("pipe");

    pipe =
        new Each(
            pipe,
            new Fields("date"),
            AssertionLevel.STRICT,
            new AssertExpression("date instanceof Long", Object.class));

    Flow flow = getPlatform().getFlowConnector().connect(input, output, pipe);

    flow.complete();

    validateLength(flow, 9, 10);
  }
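
  The STRICT assertion above holds because DateType is a CoercibleType whose canonical
  representation is a Long epoch timestamp. A minimal sketch of that coercion, assuming
  TestConstants.APACHE_DATE_FORMAT is the usual Apache log pattern "dd/MMM/yyyy:HH:mm:ss Z":

  // Sketch: DateType parses a formatted date string into its canonical Long (epoch millis).
  DateType dateType = new DateType("dd/MMM/yyyy:HH:mm:ss Z");
  Object canonical = dateType.canonical("01/Sep/2007:00:01:03 +0000");
  // canonical is a Long, so the expression "date instanceof Long" passes downstream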
  static List<String> asStrings(Fields fields) {
    if (fields == null || !fields.isDefined()) {
      // fields are undefined; callers fall back to auto-generated names
      return Collections.emptyList();
    }

    int size = fields.size();
    List<String> names = new ArrayList<String>(size);
    for (int fieldIndex = 0; fieldIndex < size; fieldIndex++) {
      names.add(fields.get(fieldIndex).toString());
    }

    return names;
  }
    @Override
    public void notifyWriteSpillBegin(Spillable spillable, int spillSize, String spillReason) {
      int numFiles = spillable.spillCount();

      if (numFiles % 10 == 0) {
        LOG.info(
            "spilling group: {}, on grouping: {}, num times: {}, with reason: {}",
            new Object[] {
              joinField.printVerbose(), spillable.getGrouping().print(), numFiles + 1, spillReason
            });

        Runtime runtime = Runtime.getRuntime();
        long freeMem = runtime.freeMemory() / 1024 / 1024;
        long maxMem = runtime.maxMemory() / 1024 / 1024;
        long totalMem = runtime.totalMemory() / 1024 / 1024;

        LOG.info(
            "mem on spill (mb), free: " + freeMem + ", total: " + totalMem + ", max: " + maxMem);
      }

      LOG.info("spilling {} tuples in list to file number {}", spillSize, numFiles + 1);

      flowProcess.increment(Spill.Num_Spills_Written, 1);
      flowProcess.increment(Spill.Num_Tuples_Spilled, spillSize);
    }
Example 4
  /**
   * Create a new datum with field names defined by {@code fields}, and field values contained in
   * {@code tuple}.
   *
   * <p>WARNING - {@code tuple} will be kept as the data container, so don't call this with a tuple
   * provided by a Cascading operation/iterator, as those get reused.
   *
   * @param fields Names of fields
   * @param tuple Data for the datum
   */
  public BaseDatum(Fields fields, Tuple tuple) {
    if (fields.size() != tuple.size()) {
      throw new IllegalArgumentException(
          "Size of fields must be the same as the size of the tuple: " + fields + "/" + tuple);
    }

    _tupleEntry = new TupleEntry(fields, tuple);
  }
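
  A hedged usage sketch (field names and values are illustrative, and it assumes BaseDatum is
  directly instantiable here); note the tuple is freshly allocated, per the warning above:

  Fields fields = new Fields("name", "count");
  Tuple tuple = new Tuple("foo", 42); // freshly allocated, never a tuple reused by Cascading
  BaseDatum datum = new BaseDatum(fields, tuple); // the datum keeps this tuple as its container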
  public Pipe addAssembly(
      String value, Map<String, String> subParams, Map<String, Pipe> pipes, Pipe pipe) {
    Fields fields = asFields(getString(subParams, "args", null));

    if (fields == null) fields = Fields.FIRST;

    return new Each(pipe, fields, new ExpressionFunction(Fields.size(1), value, String.class));
  }
Example 6
  private static Fields determineGroupFields(List<CombinerDefinition> combinerDefinitions) {
    Fields summedGroupFields = new Fields(MultiCombiner.ID_FIELD);

    for (CombinerDefinition def : combinerDefinitions) {
      summedGroupFields = Fields.merge(summedGroupFields, def.getGroupFields());
    }
    return summedGroupFields;
  }
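
  Fields.merge performs a set-style union, so group fields shared across definitions appear only
  once alongside the leading ID field. A minimal sketch of the assumed semantics:

  Fields a = new Fields("id", "name");
  Fields b = new Fields("name", "count");
  Fields merged = Fields.merge(a, b); // "id", "name", "count" -- duplicates collapse, order kept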
    static Tuple coerceToString(SinkCall<?, ?> sinkCall) {
      TupleEntry entry = sinkCall.getOutgoingEntry();
      Fields fields = entry.getFields();
      Tuple tuple = entry.getTuple();

      if (fields.hasTypes()) {
        Type[] types = new Type[fields.size()];
        for (int index = 0; index < types.length; index++) {
          Type type = fields.getType(index);
          // render CoercibleType fields (e.g. DateType) as plain Strings on the way out
          if (type instanceof CoercibleType<?>) {
            types[index] = String.class;
          } else {
            types[index] = type;
          }
        }

        tuple = entry.getCoercedTuple(types);
      }
      return tuple;
    }
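
  In effect, any field declared with a CoercibleType (such as DateType) is rendered in its String
  form on the way out rather than written as its canonical value. A hedged sketch of that round
  trip (the field name and timestamp are illustrative):

  Fields dated = new Fields("date").applyTypes(new DateType("dd/MMM/yyyy:HH:mm:ss Z"));
  TupleEntry entry = new TupleEntry(dated, new Tuple(1188604863000L));
  Tuple asText = entry.getCoercedTuple(new Type[] {String.class}); // Long rendered as formatted text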
Example 8
  public static Fields getInputFields(List<CombinerDefinition> combinerDefinitions) {
    Fields summedInputFields = new Fields();

    for (CombinerDefinition combinerDefinition : combinerDefinitions) {
      summedInputFields =
          Fields.merge(
              summedInputFields,
              combinerDefinition.getGroupFields(),
              combinerDefinition.getInputFields());
    }
    return summedInputFields;
  }
Example 9
  public static Fields getIntermediateFields(List<CombinerDefinition> combinerDefinitions) {
    Fields summedIntermediateFields = new Fields(MultiCombiner.ID_FIELD);

    for (CombinerDefinition combinerDefinition : combinerDefinitions) {
      summedIntermediateFields =
          Fields.merge(
              summedIntermediateFields,
              combinerDefinition.getGroupFields(),
              combinerDefinition.getIntermediateFields());
    }
    return summedIntermediateFields;
  }
Example 10
  @Override
  public boolean equals(Object object) {
    if (this == object) return true;
    if (!(object instanceof BaseOperation)) return false;

    BaseOperation that = (BaseOperation) object;

    if (numArgs != that.numArgs) return false;
    if (fieldDeclaration != null
        ? !fieldDeclaration.equals(that.fieldDeclaration)
        : that.fieldDeclaration != null) return false;

    return true;
  }
Example 11
  /** Contributed by gicode */
  @Test
  public void testParserDeclared5() throws IOException {
    RegexParser splitter = new RegexParser(new Fields("bar"), "^GET /foo\\?bar=([^\\&]+)&");
    Tuple arguments = new Tuple("GET /foo?bar=z123&baz=2");
    Fields resultFields = Fields.size(1);

    TupleListCollector collector = invokeFunction(splitter, arguments, resultFields);

    assertEquals("wrong size", 1, collector.size());

    Iterator<Tuple> iterator = collector.iterator();

    Tuple tuple = iterator.next();

    assertEquals("wrong tuple size", 1, tuple.size());
    assertEquals("not equal: tuple.get(0)", "z123", tuple.getObject(0));
  }
Example 12
  @Test
  public void testParserDeclared6() throws IOException {
    RegexParser splitter = new RegexParser(new Fields("lhs"), "(\\S+)\\s+\\S+", new int[] {1});
    Tuple arguments = new Tuple("foo\tbar");
    Fields resultFields = Fields.size(1);

    TupleListCollector collector = invokeFunction(splitter, arguments, resultFields);

    assertEquals("wrong size", 1, collector.size());

    Iterator<Tuple> iterator = collector.iterator();

    Tuple tuple = iterator.next();

    assertEquals("wrong tupel size", 1, tuple.size());
    assertEquals("not equal: tuple.get(0)", "foo", tuple.getObject(0));
  }
Example 13
  public static <T> void populateOutputTupleEntry(
      CombinerDefinition<T> definition, TupleEntry output, Tuple resultTuple) {
    // set the ID so we can differentiate later
    output.setRaw(MultiCombiner.ID_FIELD, definition.getId());

    // our tuples are of the form groupFields+outputFields, set the TupleEntry fields appropriately
    Fields groupFields = definition.getGroupFields();
    int index = 0;
    for (int i = 0; i < groupFields.size(); i++) {
      output.setRaw(groupFields.get(i), resultTuple.getObject(index));
      index++;
    }
    Fields outputFields = definition.getOutputFields();
    for (int i = 0; i < outputFields.size(); i++) {
      output.setRaw(outputFields.get(i), resultTuple.getObject(index));
      index++;
    }
  }
Example 14
  /**
   * Constructor Unique creates a new Unique instance.
   *
   * @param name of type String
   * @param pipes of type Pipe[]
   * @param uniqueFields of type Fields
   * @param include of type Include
   * @param threshold of type int
   */
  @ConstructorProperties({"name", "pipes", "uniqueFields", "include", "threshold"})
  public Unique(String name, Pipe[] pipes, Fields uniqueFields, Include include, int threshold) {
    super(pipes);

    if (uniqueFields == null) throw new IllegalArgumentException("uniqueFields may not be null");

    Pipe[] filters = new Pipe[pipes.length];

    TupleHasher tupleHasher = null;
    Comparator[] comparators = uniqueFields.getComparators();

    if (!TupleHasher.isNull(comparators)) tupleHasher = new TupleHasher(null, comparators);

    FilterPartialDuplicates partialDuplicates =
        new FilterPartialDuplicates(include, threshold, tupleHasher);

    for (int i = 0; i < filters.length; i++)
      filters[i] = new Each(pipes[i], uniqueFields, partialDuplicates);

    Pipe pipe = new GroupBy(name, filters, uniqueFields);
    pipe = new Every(pipe, Fields.ALL, new FirstNBuffer(), Fields.RESULTS);

    setTails(pipe);
  }
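
  A hedged usage sketch (pipe and field names are illustrative; the Include value and the
  threshold follow the signature above):

  // Keep one tuple per distinct "url" seen across both branches, spilling
  // the partial-duplicates filter state after 10000 observed entries.
  Pipe lhs = new Pipe("lhs");
  Pipe rhs = new Pipe("rhs");
  Pipe unique =
      new Unique("unique-urls", Pipe.pipes(lhs, rhs), new Fields("url"), Unique.Include.ALL, 10000);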
Example 15
  public void testStop() throws Exception {
    if (!new File(inputFileLower).exists()) fail("data file not found");

    copyFromLocal(inputFileLower);
    copyFromLocal(inputFileUpper);

    Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);
    Tap sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), inputFileUpper);

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put("lower", sourceLower);
    sources.put("upper", sourceUpper);

    Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

    // using null pos so all fields are written
    Tap sink = new Hfs(new TextLine(), outputPath + "/stopped/", true);

    Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

    pipeLower = new GroupBy(pipeLower, new Fields("num"));

    Pipe pipeUpper = new Each(new Pipe("upper"), new Fields("line"), splitter);

    pipeUpper = new GroupBy(pipeUpper, new Fields("num"));

    Pipe splice =
        new CoGroup(pipeLower, new Fields("num"), pipeUpper, new Fields("num"), Fields.size(4));

    Flow flow = new FlowConnector(getProperties()).connect(sources, sink, splice);

    //    flow.writeDOT( "stopped.dot" );

    LockingFlowListener listener = new LockingFlowListener();

    flow.addListener(listener);

    System.out.println("calling start");
    flow.start();

    assertTrue("did not start", listener.started.tryAcquire(60, TimeUnit.SECONDS));

    while (true) {
      System.out.println("testing if running");
      Thread.sleep(1000);

      Map<String, Callable<Throwable>> map = flow.getJobsMap();

      if (map == null || map.values().size() == 0) continue;

      if (((FlowStepJob) map.values().iterator().next()).wasStarted()) break;
    }

    System.out.println("calling stop");

    flow.stop();

    assertTrue("did not stop", listener.stopped.tryAcquire(60, TimeUnit.SECONDS));
    assertTrue("did not complete", listener.completed.tryAcquire(60, TimeUnit.SECONDS));
  }
Example 16
 protected void validateFields(Fields superFields, Fields myFields) {
   if (!superFields.contains(myFields)) {
     throw new IllegalArgumentException("Fields passed to constructor don't contain " + myFields);
   }
 }
Example 17
 /**
  * Create an empty datum with field names defined by {@code fields}.
  *
  * @param fields Names of fields
  */
 public BaseDatum(Fields fields) {
   this(new TupleEntry(fields, Tuple.size(fields.size())));
 }
Example 18
 private TupleEntry getEntry(Tuple tuple) {
   return new TupleEntry(Fields.size(tuple.size()), tuple);
 }
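
 Since Fields.size(tuple.size()) declares purely positional field names, the resulting entry is
 addressable only by index. A minimal sketch of the assumed behavior:

 Tuple tuple = new Tuple("a", "b");
 TupleEntry entry = new TupleEntry(Fields.size(tuple.size()), tuple); // unnamed positional fields
 Object first = entry.getObject(0); // "a" -- fetched by position, not by name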
Example 19
  public static void validate(File solrCoreDir, String dataDirPropertyName, Fields schemeFields)
      throws IOException {

    // Verify solrCoreDir exists and is a directory
    if (!solrCoreDir.exists() || !solrCoreDir.isDirectory()) {
      throw new TapException("Solr core directory doesn't exist: " + solrCoreDir);
    }

    File tmpSolrHome = makeTempSolrHome(solrCoreDir);

    // Set up a temp location for Solr home, where we'll write out a synthetic solr.xml
    // that references the core directory.
    String coreName = solrCoreDir.getName();
    String corePath = solrCoreDir.getAbsolutePath();
    String solrXmlContent =
        String.format(
            "<solr><cores><core name=\"%s\" instanceDir=\"%s\"></core></cores></solr>",
            coreName, corePath);
    File solrXmlFile = new File(tmpSolrHome, "solr.xml");
    FileUtils.write(solrXmlFile, solrXmlContent);

    // Set up a temp location for data, so when we instantiate the coreContainer,
    // we don't pollute the solr home with a /data sub-dir.
    String tmpFolder = System.getProperty("java.io.tmpdir");
    File tmpDataDir = new File(tmpFolder, UUID.randomUUID().toString());
    tmpDataDir.mkdir();

    System.setProperty("solr.solr.home", tmpSolrHome.getAbsolutePath());
    System.setProperty(dataDirPropertyName, tmpDataDir.getAbsolutePath());
    System.setProperty(
        "enable.special-handlers", "false"); // All we need is the update request handler
    System.setProperty(
        "enable.cache-warming", "false"); // We certainly don't need to warm the cache

    CoreContainer.Initializer initializer = new CoreContainer.Initializer();
    CoreContainer coreContainer = null;

    try {
      coreContainer = initializer.initialize();
      Collection<SolrCore> cores = coreContainer.getCores();
      SolrCore core = null;

      if (cores.size() == 0) {
        throw new TapException("No Solr cores are available");
      } else if (cores.size() > 1) {
        throw new TapException("Only one Solr core is supported");
      } else {
        core = cores.iterator().next();
      }

      IndexSchema schema = core.getSchema();
      Map<String, SchemaField> solrFields = schema.getFields();
      Set<String> schemeFieldnames = new HashSet<String>();

      for (int i = 0; i < schemeFields.size(); i++) {
        String fieldName = schemeFields.get(i).toString();
        if (!solrFields.containsKey(fieldName)) {
          throw new TapException("Sink field name doesn't exist in Solr schema: " + fieldName);
        }

        schemeFieldnames.add(fieldName);
      }

      for (String solrFieldname : solrFields.keySet()) {
        SchemaField solrField = solrFields.get(solrFieldname);
        if (solrField.isRequired() && !schemeFieldnames.contains(solrFieldname)) {
          throw new TapException("No sink field name for required Solr field: " + solrFieldname);
        }
      }
    } finally {
      if (coreContainer != null) {
        coreContainer.shutdown();
      }
    }
  }
Example 20
 @Override
 public int hashCode() {
   int result = fieldDeclaration != null ? fieldDeclaration.hashCode() : 0;
   result = 31 * result + numArgs;
   return result;
 }
Example 21
 /**
  * Selects and returns the first argument Tuple encountered.
  *
  * @param fieldDeclaration of type Fields
  */
 @ConstructorProperties({"fieldDeclaration"})
 public First(Fields fieldDeclaration) {
   super(fieldDeclaration.size(), fieldDeclaration);
 }
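
 A hedged usage sketch (pipe and field names are illustrative):

 // Take the first "value" seen within each "id" group.
 Pipe grouped = new GroupBy(pipe, new Fields("id"));
 grouped = new Every(grouped, new Fields("value"), new First(new Fields("first_value")), Fields.ALL);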
Example 22
  public void failingListenerTest(FailingFlowListener.OnFail onFail) throws Exception {
    if (!new File(inputFileLower).exists()) fail("data file not found");

    copyFromLocal(inputFileLower);
    copyFromLocal(inputFileUpper);

    Tap sourceLower = new Hfs(new TextLine(new Fields("offset", "line")), inputFileLower);
    Tap sourceUpper = new Hfs(new TextLine(new Fields("offset", "line")), inputFileUpper);

    Map<String, Tap> sources = new HashMap<String, Tap>();

    sources.put("lower", sourceLower);
    sources.put("upper", sourceUpper);

    Function splitter = new RegexSplitter(new Fields("num", "char"), " ");

    // using null pos so all fields are written
    Tap sink = new Hfs(new TextLine(), outputPath + "/stopped/", true);

    Pipe pipeLower = new Each(new Pipe("lower"), new Fields("line"), splitter);

    if (onFail == FailingFlowListener.OnFail.THROWABLE) {
      pipeLower =
          new Each(
              pipeLower,
              new Debug() {
                @Override
                public boolean isRemove(FlowProcess flowProcess, FilterCall filterCall) {
                  throw new RuntimeException("failing inside pipe assembly intentionally");
                }
              });
    }

    pipeLower = new GroupBy(pipeLower, new Fields("num"));

    Pipe pipeUpper = new Each(new Pipe("upper"), new Fields("line"), splitter);

    pipeUpper = new GroupBy(pipeUpper, new Fields("num"));

    Pipe splice =
        new CoGroup(pipeLower, new Fields("num"), pipeUpper, new Fields("num"), Fields.size(4));

    Flow flow = new FlowConnector(getProperties()).connect(sources, sink, splice);

    //    flow.writeDOT( "stopped.dot" );

    FailingFlowListener listener = new FailingFlowListener(onFail);

    flow.addListener(listener);

    System.out.println("calling start");
    flow.start();

    assertTrue("did not start", listener.started.tryAcquire(120, TimeUnit.SECONDS));

    if (onFail == FailingFlowListener.OnFail.STOPPING) {
      while (true) {
        System.out.println("testing if running");
        Thread.sleep(1000);

        Map<String, Callable<Throwable>> map = flow.getJobsMap();

        if (map == null || map.values().size() == 0) continue;

        if (((FlowStepJob) map.values().iterator().next()).wasStarted()) break;
      }

      System.out.println("calling stop");

      flow.stop();
    }

    assertTrue("did not complete", listener.completed.tryAcquire(120, TimeUnit.SECONDS));
    assertTrue("did not stop", listener.stopped.tryAcquire(120, TimeUnit.SECONDS));

    try {
      flow.complete();
      fail("did not rethrow exception from listener");
    } catch (Exception exception) {
      // ignore
    }
  }