/**
 * Runs a match/merge over generated records using the CONCATENATE survivorship function and
 * checks the size of every merged group and the length of its concatenated value.
 *
 * <p>The generated "name" values cycle through the first {@code constantNumber} entries of
 * {@code CONSTANTS}. The test expects one merged record per constant, each holding
 * {@code totalCount / constantNumber} related ids (integer division) and exactly one attribute
 * whose value is as long as all grouped values plus the separators between them.
 *
 * @param constantNumber number of distinct values taken from {@code CONSTANTS}
 * @param totalCount number of records handed to the record iterator
 * @param matchAlgorithm attribute matcher used to build the algorithm
 * @param separator value passed to {@code MFB.build} alongside the CONCATENATE function; a
 *     {@code null} separator is counted as zero characters
 */
private static void testConcatenateParameter(
    final int constantNumber,
    int totalCount,
    AttributeMatcherType matchAlgorithm,
    String separator) {
  Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>();
  generators.put(
      "name",
      new ValueGenerator() {
        // NOTE(review): getColumnIndex() returns the same running counter that newValue()
        // advances - confirm this is intended.
        int index = 0;

        @Override
        public int getColumnIndex() {
          return index;
        }

        @Override
        public String newValue() {
          return CONSTANTS[index++ % constantNumber];
        }
      });
  RecordGenerator recordGenerator = new RecordGenerator();
  recordGenerator.setMatchKeyMap(generators);
  Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator);
  MatchMergeAlgorithm algorithm =
      MFB.build(
          new AttributeMatcherType[] {matchAlgorithm},
          new String[] {""},
          new float[] {1},
          0,
          new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.CONCATENATE},
          new String[] {separator},
          new double[] {1},
          new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll},
          new SubString[] {SubString.NO_SUBSTRING},
          "MFB");
  List<Record> mergedRecords = algorithm.execute(iterator);
  assertEquals(constantNumber, mergedRecords.size());

  // Both values are the same for every merged record, so compute them once.
  // Integer division already truncates; no rounding step is needed.
  final int expectedGroupSize = totalCount / constantNumber;
  final int separatorLength = separator == null ? 0 : separator.length();

  // The loop assumes merged records come back in the same order as CONSTANTS.
  int i = 0;
  for (Record mergedRecord : mergedRecords) {
    int relatedIdCount = mergedRecord.getRelatedIds().size();
    // One separator sits between each pair of concatenated values.
    int spaceCount = (relatedIdCount - 1) * separatorLength;
    List<Attribute> attributes = mergedRecord.getAttributes();
    assertEquals(expectedGroupSize, relatedIdCount);
    assertEquals(1, attributes.size());
    Attribute attribute = attributes.get(0);
    assertEquals(
        (CONSTANTS[i].length() * relatedIdCount) + spaceCount, attribute.getValue().length());
    i++;
  }
}
/**
 * Runs a match/merge over generated records using the LONGEST survivorship function and checks
 * that the records are grouped per distinct value.
 *
 * <p>The generated "name" values cycle through the first {@code constantNumber} entries of
 * {@code CONSTANTS}. The test expects one merged record per constant, each holding
 * {@code totalCount / constantNumber} related ids (integer division).
 *
 * @param constantNumber number of distinct values taken from {@code CONSTANTS}
 * @param totalCount number of records handed to the record iterator
 * @param matchAlgorithm attribute matcher used to build the algorithm
 */
private static void testConstant(
    final int constantNumber, int totalCount, AttributeMatcherType matchAlgorithm) {
  Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>();
  generators.put(
      "name",
      new ValueGenerator() {
        // NOTE(review): getColumnIndex() returns the same running counter that newValue()
        // advances - confirm this is intended.
        int index = 0;

        @Override
        public int getColumnIndex() {
          return index;
        }

        @Override
        public String newValue() {
          return CONSTANTS[index++ % constantNumber];
        }
      });
  RecordGenerator recordGenerator = new RecordGenerator();
  recordGenerator.setMatchKeyMap(generators);
  Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator);
  MatchMergeAlgorithm algorithm =
      MFB.build(
          new AttributeMatcherType[] {matchAlgorithm},
          new String[] {""},
          new float[] {1},
          0,
          new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.LONGEST},
          new String[] {""},
          new double[] {1},
          new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll},
          new SubString[] {SubString.NO_SUBSTRING},
          "MFB");
  List<Record> mergedRecords = algorithm.execute(iterator);
  assertEquals(constantNumber, mergedRecords.size());

  // Integer division already truncates; no rounding step is needed.
  final int expectedGroupSize = totalCount / constantNumber;
  for (Record mergedRecord : mergedRecords) {
    assertEquals(expectedGroupSize, mergedRecord.getRelatedIds().size());
  }
}
/**
 * Creates a generation task for one group of output files.
 *
 * <p>The constructor loads the text record generator from {@code parameterSet.instanceDoc},
 * writes the replication factor (and the codec, when one is set) into {@code conf}, creates the
 * file system sink pointing at {@code parameterSet.outputDir}, and finally calls
 * {@code computeTotal()}.
 *
 * @param conf configuration that receives "dfs.replication" and, optionally, "generator.codec";
 *     it is then handed to the sink
 * @param parameterSet generation settings read by this task
 * @param group per-file target values for this task (stored as {@code list})
 * @param id identifier of this task
 */
public GeneratorTask(Configuration conf, ParameterSet parameterSet, List<Long> group, String id) {
  this.parameterSet = parameterSet;
  this.list = group;
  this.id = id;

  // A constructor call either returns an instance or throws, so no null check is needed here.
  generator = new TextRecordGenerator();
  LOG.debug("InstanceDoc: " + parameterSet.instanceDoc);
  generator.loadGenerator(parameterSet.instanceDoc);

  dateTime = new DateTime();
  // NOTE(review): "hh" is presumably the 12-hour clock in this formatter, which makes morning
  // and afternoon timestamps indistinguishable without an AM/PM marker - confirm whether "HH"
  // was intended before changing it, since it may affect generated file names.
  formatter = DateTimeFormat.forPattern("yyyyMMdd-hhmmss");

  conf.set("dfs.replication", "" + parameterSet.replicaNum);
  if (parameterSet.codec != null) {
    conf.set("generator.codec", parameterSet.codec);
  }

  sink = new FileSystemSink(conf);
  sink.setOutputFolder(parameterSet.outputDir);
  computeTotal();
}
private static void testWeight( final int constantNumber, int totalCount, AttributeMatcherType matchAlgorithm) { Map<String, ValueGenerator> generators = new HashMap<String, ValueGenerator>(); generators.put( "name", new ValueGenerator() { int index = 0; @Override public int getColumnIndex() { return index; } @Override public String newValue() { return CONSTANTS[index++ % constantNumber]; } }); // Runs a first match with a weight 1 RecordGenerator recordGenerator = new RecordGenerator(); recordGenerator.setMatchKeyMap(generators); Iterator<Record> iterator = new RecordIterator(totalCount, recordGenerator); MatchMergeAlgorithm algorithm = MFB.build( new AttributeMatcherType[] {matchAlgorithm}, new String[] {""}, new float[] {1}, 0, new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.LONGEST}, new String[] {""}, new double[] {1}, // Mark rule with a weight of 1. new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll}, new SubString[] {SubString.NO_SUBSTRING}, "MFB"); List<Record> mergedRecords = algorithm.execute(iterator); assertEquals(constantNumber, mergedRecords.size()); long totalConfidence1 = 0; for (Record mergedRecord : mergedRecords) { assertEquals(totalCount / constantNumber, mergedRecord.getRelatedIds().size()); totalConfidence1 += mergedRecord.getConfidence(); } // Runs a second match with a weight 4 iterator = new RecordIterator(totalCount, recordGenerator); algorithm = MFB.build( new AttributeMatcherType[] {matchAlgorithm}, new String[] {""}, new float[] {1}, 0, new SurvivorShipAlgorithmEnum[] {SurvivorShipAlgorithmEnum.LONGEST}, new String[] {""}, new double[] { 4 }, // Mark rule with a weight of 4 -> should not affect overall score since score is // normalized. 
new IAttributeMatcher.NullOption[] {IAttributeMatcher.NullOption.nullMatchAll}, new SubString[] {SubString.NO_SUBSTRING}, "MFB"); mergedRecords = algorithm.execute(iterator); assertEquals(constantNumber, mergedRecords.size()); long totalConfidence2 = 0; for (Record mergedRecord : mergedRecords) { assertEquals(totalCount / constantNumber, mergedRecord.getRelatedIds().size()); totalConfidence2 += mergedRecord.getConfidence(); } // ... but this shouldn't change the overall score (because score is always between 0 and 1). assertEquals(totalConfidence1, totalConfidence2); }
@Override public void run() { // For streaming generation if (parameterSet.neverStop) { rp = new RecordProducer(); } int i = 0; while (parameterSet.neverStop || i < list.size()) { PrintWriter printWriter = null; OutputStream outputStream = sink.getOutputStream(getFileName(i)); if (outputStream == null) { throw new NullPointerException("null output stream"); } printWriter = new PrintWriter(new OutputStreamWriter(outputStream)); long expectedSize = 0; if (parameterSet.neverStop) { expectedSize = list.get(0); } else { expectedSize = list.get(i); } if (parameterSet.useSizeMeasurement) { expectedSize = expectedSize * 1024 * 1024; } long counter = 0; long size = 0; long recordCounter = 0; long beginTime = System.currentTimeMillis(); long endTime = beginTime; while (true) { try { Record record = generator.generate(); size += record.getLength(); if (!parameterSet.useSizeMeasurement) { counter++; } ++recordCounter; endTime = System.currentTimeMillis(); if ((!parameterSet.useSizeMeasurement && counter >= expectedSize) || (parameterSet.useSizeMeasurement && size >= expectedSize) || (parameterSet.neverStop && (endTime - beginTime >= 10000))) { printWriter.close(); if (rp != null) { rp.put(parameterSet.outputDir + "/" + getFileName(i), new Integer(i).toString()); } LOG.info("############ the time that generate the record: " + (endTime - beginTime)); LOG.info("############ the number of the code is:" + recordCounter); LOG.info( "Finish writing " + getFileName(i) + " with " + ((parameterSet.useSizeMeasurement) ? (size + " bytes") : (counter + " records"))); break; } else { printWriter.println(record.getAsString()); } } catch (Exception e) { LOG.error("", e); } } accumulate(size); lastDone = counter; i++; } isRunning = false; }