Example #1
0
  /**
   * Runs a match/merge with the {@code CONCATENATE} survivorship function and checks the size of
   * the result and the length of each surviving value.
   *
   * <p>The "name" value of the generated records cycles through the first {@code constantNumber}
   * entries of {@code CONSTANTS}. The test then asserts that:
   *
   * <ul>
   *   <li>{@code algorithm.execute} returns exactly {@code constantNumber} records;
   *   <li>each returned record has {@code totalCount / constantNumber} related ids (integer
   *       division) and a single attribute;
   *   <li>the attribute value is as long as the i-th constant repeated once per related id, plus
   *       one separator between consecutive repetitions.
   * </ul>
   *
   * @param constantNumber number of distinct values taken from {@code CONSTANTS}
   * @param totalCount number of records handed to the {@code RecordIterator}
   * @param matchAlgorithm attribute matcher passed to {@code MFB.build}
   * @param separator survivorship parameter passed to {@code MFB.build}; {@code null} is counted
   *     as a zero-length separator when computing the expected value length
   */
  private static void testConcatenateParameter(
      final int constantNumber,
      int totalCount,
      AttributeMatcherType matchAlgorithm,
      String separator) {
    Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>();
    generators.put(
        "name",
        new ValueGenerator() {

          // Running counter: advanced on every generated value.
          int index = 0;

          @Override
          public int getColumnIndex() {
            return index;
          }

          @Override
          public String newValue() {
            // Cycle through the first constantNumber constants.
            return CONSTANTS[index++ % constantNumber];
          }
        });
    RecordGenerator recordGenerator = new RecordGenerator();
    recordGenerator.setMatchKeyMap(generators);
    Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator);
    MatchMergeAlgorithm algorithm =
        MFB.build(
            new AttributeMatcherType[] {matchAlgorithm},
            new String[] {""},
            new float[] {1},
            0,
            new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.CONCATENATE},
            new String[] {separator},
            new double[] {1},
            new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll},
            new SubString[] {SubString.NO_SUBSTRING},
            "MFB");
    List<Record> mergedRecords = algorithm.execute(iterator);
    assertEquals(constantNumber, mergedRecords.size());

    // Both values are the same for every merged record, so compute them once.
    int separatorLength = separator == null ? 0 : separator.length();
    // Plain integer division: wrapping it in Math.round() would only push the already
    // truncated quotient through a lossy float conversion.
    int expectedGroupSize = totalCount / constantNumber;

    int i = 0;
    for (Record mergedRecord : mergedRecords) {
      int relatedIdCount = mergedRecord.getRelatedIds().size();
      // One separator between each pair of consecutive concatenated values.
      int separatorChars = (relatedIdCount - 1) * separatorLength;
      List<Attribute> attributes = mergedRecord.getAttributes();
      assertEquals(expectedGroupSize, relatedIdCount);
      assertEquals(1, attributes.size());
      Attribute attribute = attributes.get(0);
      assertEquals(
          (CONSTANTS[i].length() * relatedIdCount) + separatorChars,
          attribute.getValue().length());
      i++;
    }
  }
Example #2
0
  /**
   * Runs a match/merge with the {@code LONGEST} survivorship function over records whose "name"
   * value cycles through the first {@code constantNumber} entries of {@code CONSTANTS}.
   *
   * <p>Asserts that {@code algorithm.execute} returns exactly {@code constantNumber} records and
   * that each of them has {@code totalCount / constantNumber} related ids (integer division).
   *
   * @param constantNumber number of distinct values taken from {@code CONSTANTS}
   * @param totalCount number of records handed to the {@code RecordIterator}
   * @param matchAlgorithm attribute matcher passed to {@code MFB.build}
   */
  private static void testConstant(
      final int constantNumber, int totalCount, AttributeMatcherType matchAlgorithm) {
    Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>();
    generators.put(
        "name",
        new ValueGenerator() {

          // Running counter: advanced on every generated value.
          int index = 0;

          @Override
          public int getColumnIndex() {
            return index;
          }

          @Override
          public String newValue() {
            // Cycle through the first constantNumber constants.
            return CONSTANTS[index++ % constantNumber];
          }
        });
    RecordGenerator recordGenerator = new RecordGenerator();
    recordGenerator.setMatchKeyMap(generators);
    Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator);
    MatchMergeAlgorithm algorithm =
        MFB.build(
            new AttributeMatcherType[] {matchAlgorithm},
            new String[] {""},
            new float[] {1},
            0,
            new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.LONGEST},
            new String[] {""},
            new double[] {1},
            new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll},
            new SubString[] {SubString.NO_SUBSTRING},
            "MFB");
    List<Record> mergedRecords = algorithm.execute(iterator);
    assertEquals(constantNumber, mergedRecords.size());

    // Plain integer division: wrapping it in Math.round() would only push the already
    // truncated quotient through a lossy float conversion.
    int expectedGroupSize = totalCount / constantNumber;
    for (Record mergedRecord : mergedRecords) {
      assertEquals(expectedGroupSize, mergedRecord.getRelatedIds().size());
    }
  }
  /**
   * Creates a generation task.
   *
   * <p>The constructor stores its arguments, creates a {@code TextRecordGenerator} and loads it
   * from {@code parameterSet.instanceDoc}, captures the current time together with the formatter
   * used for it, configures {@code conf} (replication factor and, when set, the codec), creates
   * the {@code FileSystemSink} writing to {@code parameterSet.outputDir}, and finally calls
   * {@code computeTotal()}.
   *
   * @param conf configuration that is updated with {@code dfs.replication} and, when a codec is
   *     given, {@code generator.codec}, and then handed to the sink
   * @param parameterSet generation parameters ({@code instanceDoc}, {@code replicaNum},
   *     {@code codec}, {@code outputDir} are read here)
   * @param group values stored as this task's {@code list}
   * @param id identifier stored for this task
   */
  public GeneratorTask(Configuration conf, ParameterSet parameterSet, List<Long> group, String id) {
    this.parameterSet = parameterSet;
    this.list = group;
    this.id = id;
    // A "new" expression never evaluates to null, so no null check (and no System.exit) is
    // needed after it.
    generator = new TextRecordGenerator();

    LOG.debug("InstanceDoc: " + parameterSet.instanceDoc);
    generator.loadGenerator(parameterSet.instanceDoc);
    dateTime = new DateTime();
    // "HH" is the 24-hour hour of day. The 12-hour "hh" without an AM/PM marker would produce
    // the same stamp for e.g. 01:00 and 13:00.
    formatter = DateTimeFormat.forPattern("yyyyMMdd-HHmmss");

    conf.set("dfs.replication", "" + parameterSet.replicaNum);
    if (parameterSet.codec != null) {
      conf.set("generator.codec", parameterSet.codec);
    }
    sink = new FileSystemSink(conf);
    sink.setOutputFolder(parameterSet.outputDir);

    computeTotal();
  }
Example #4
0
  /**
   * Checks that the weight given to a match rule does not change the overall confidence.
   *
   * <p>The same record generator is matched twice, once with a rule weight of 1 and once with a
   * rule weight of 4. Each run must return {@code constantNumber} records with
   * {@code totalCount / constantNumber} related ids each, and the summed confidence of both runs
   * must be equal.
   *
   * @param constantNumber number of distinct values taken from {@code CONSTANTS}
   * @param totalCount number of records handed to each {@code RecordIterator}
   * @param matchAlgorithm attribute matcher passed to {@code MFB.build}
   */
  private static void testWeight(
      final int constantNumber, int totalCount, AttributeMatcherType matchAlgorithm) {
    Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>();
    generators.put(
        "name",
        new ValueGenerator() {

          // Running counter: advanced on every generated value.
          int index = 0;

          @Override
          public int getColumnIndex() {
            return index;
          }

          @Override
          public String newValue() {
            // Cycle through the first constantNumber constants.
            return CONSTANTS[index++ % constantNumber];
          }
        });
    RecordGenerator recordGenerator = new RecordGenerator();
    recordGenerator.setMatchKeyMap(generators);

    // Runs a first match with a weight 1
    double totalConfidence1 =
        runMatchAndSumConfidence(recordGenerator, constantNumber, totalCount, matchAlgorithm, 1);
    // Runs a second match with a weight 4 -> should not affect overall score since score is
    // normalized.
    double totalConfidence2 =
        runMatchAndSumConfidence(recordGenerator, constantNumber, totalCount, matchAlgorithm, 4);

    // ... but this shouldn't change the overall score (because score is always between 0 and 1).
    // Compared as doubles: summing into a long would truncate every fractional confidence and
    // make this check nearly meaningless.
    assertEquals(totalConfidence1, totalConfidence2, 0.000001d);
  }

  /**
   * Runs one LONGEST-survivorship match over {@code totalCount} generated records using the given
   * rule weight, asserts the number and size of the returned groups, and returns the sum of the
   * confidences of the returned records.
   */
  private static double runMatchAndSumConfidence(
      RecordGenerator recordGenerator,
      int constantNumber,
      int totalCount,
      AttributeMatcherType matchAlgorithm,
      double weight) {
    Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator);
    MatchMergeAlgorithm algorithm =
        MFB.build(
            new AttributeMatcherType[] {matchAlgorithm},
            new String[] {""},
            new float[] {1},
            0,
            new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.LONGEST},
            new String[] {""},
            new double[] {weight}, // Weight of the single match rule.
            new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll},
            new SubString[] {SubString.NO_SUBSTRING},
            "MFB");
    List<Record> mergedRecords = algorithm.execute(iterator);
    assertEquals(constantNumber, mergedRecords.size());
    double totalConfidence = 0;
    for (Record mergedRecord : mergedRecords) {
      assertEquals(totalCount / constantNumber, mergedRecord.getRelatedIds().size());
      totalConfidence += mergedRecord.getConfidence();
    }
    return totalConfidence;
  }
  /**
   * Generates the output files of this task, one file per loop iteration.
   *
   * <p>For each file an output stream is obtained from the sink and records produced by the
   * generator are written to it until one of the stop conditions holds:
   *
   * <ul>
   *   <li>record-count mode ({@code useSizeMeasurement} is false): the number of records reaches
   *       the expected value taken from {@code list};
   *   <li>size mode ({@code useSizeMeasurement} is true): the accumulated record length reaches
   *       the expected value, which is multiplied by 1024 * 1024;
   *   <li>streaming mode ({@code neverStop} is true): at least 10000 ms have elapsed since the
   *       file was started.
   * </ul>
   *
   * <p>Every generated record is written before the stop conditions are evaluated, so the
   * counters that are logged, accumulated and stored in {@code lastDone} describe what was
   * actually written. In streaming mode the outer loop never ends and always uses the first
   * entry of {@code list} as the expected value; each finished file is also announced through
   * the {@code RecordProducer}.
   *
   * <p>Exceptions raised while producing or writing a record are logged and generation continues
   * with the next record.
   *
   * @throws NullPointerException if the sink returns no output stream for a file
   */
  @Override
  public void run() {

    // For streaming generation
    if (parameterSet.neverStop) {
      rp = new RecordProducer();
    }

    int i = 0;
    while (parameterSet.neverStop || i < list.size()) {
      OutputStream outputStream = sink.getOutputStream(getFileName(i));
      if (outputStream == null) {
        throw new NullPointerException("null output stream");
      }
      PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(outputStream));

      // Streaming mode reuses the first entry for every file.
      long expectedSize = parameterSet.neverStop ? list.get(0) : list.get(i);
      if (parameterSet.useSizeMeasurement) {
        expectedSize = expectedSize * 1024 * 1024;
      }

      long counter = 0;
      long size = 0;

      long recordCounter = 0;
      long beginTime = System.currentTimeMillis();
      long endTime = beginTime;

      while (true) {
        try {
          Record record = generator.generate();
          // Write first, then count: the record that makes a stop condition true belongs to
          // this file and must not be dropped.
          printWriter.println(record.getAsString());

          size += record.getLength();
          if (!parameterSet.useSizeMeasurement) {
            counter++;
          }

          ++recordCounter;
          endTime = System.currentTimeMillis();

          boolean countReached = !parameterSet.useSizeMeasurement && counter >= expectedSize;
          boolean sizeReached = parameterSet.useSizeMeasurement && size >= expectedSize;
          boolean timeReached = parameterSet.neverStop && (endTime - beginTime >= 10000);

          if (countReached || sizeReached || timeReached) {
            printWriter.close();

            if (rp != null) {
              rp.put(parameterSet.outputDir + "/" + getFileName(i), Integer.toString(i));
            }

            LOG.info("############ the time that generate the record: " + (endTime - beginTime));
            LOG.info("############ the number of the code is:" + recordCounter);

            LOG.info(
                "Finish writing "
                    + getFileName(i)
                    + " with "
                    + ((parameterSet.useSizeMeasurement)
                        ? (size + " bytes")
                        : (counter + " records")));
            break;
          }
        } catch (Exception e) {
          // Best effort: a failing record is logged and generation goes on.
          LOG.error("", e);
        }
      }
      accumulate(size);
      lastDone = counter;

      i++;
    }

    isRunning = false;
  }