/**
 * Converts every XML column into a {@link ColumnModel}, pairing each one with the column metadata
 * entry that carries the same name when such an entry exists.
 *
 * @param columnMetaData metadata entries searched by name for each XML column
 * @param xmlColumns XML columns to convert
 * @param columnNameComparatorType comparator type forwarded for the column names
 * @param typesBelongingCompositeTypeForComparatorType component types forwarded for composite
 *     column names
 * @param defaultColumnValueType value type forwarded for the column values
 * @return the converted columns, in the iteration order of {@code xmlColumns}
 */
private List<ColumnModel> mapXmlColumnsToColumnsModel(
    List<ColumnMetadata> columnMetaData,
    List<Column> xmlColumns,
    ComparatorType columnNameComparatorType,
    GenericTypeEnum[] typesBelongingCompositeTypeForComparatorType,
    ComparatorType defaultColumnValueType) {
  List<ColumnModel> converted = new ArrayList<ColumnModel>();

  for (Column current : xmlColumns) {
    // Scan the whole metadata list without an early exit: when several entries share the
    // column's name, the last one is the one that gets used.
    ColumnMetadata matching = null;
    for (ColumnMetadata candidate : columnMetaData) {
      if (candidate.getName().equals(current.getName())) {
        matching = candidate;
      }
    }

    ColumnModel model =
        mapXmlColumnToColumnModel(
            matching,
            current,
            columnNameComparatorType,
            typesBelongingCompositeTypeForComparatorType,
            defaultColumnValueType);
    converted.add(model);
  }

  return converted;
}
/** * If the key is to be associated with a valid value, a mutation is created for it with the given * table and columns. In the event the value in the column is missing (i.e., null), then it is * marked for {@link Deletion}. Similarly, if the entire value for a key is missing (i.e., null), * then the entire key is marked for {@link Deletion}. * * @param keyColumns the key to write. * @param values the values to write. * @throws IOException */ @Override public void write(Map<String, ByteBuffer> keyColumns, List<ByteBuffer> values) throws IOException { TokenRange range = ringCache.getRange(getPartitionKey(keyColumns)); // get the client for the given range, or create a new one final InetAddress address = ringCache.getEndpoints(range).get(0); RangeClient client = clients.get(address); if (client == null) { // haven't seen keys for this range: create new client client = new RangeClient(ringCache.getEndpoints(range)); client.start(); clients.put(address, client); } // add primary key columns to the bind variables List<ByteBuffer> allValues = new ArrayList<ByteBuffer>(values); for (ColumnMetadata column : partitionKeyColumns) allValues.add(keyColumns.get(column.getName())); for (ColumnMetadata column : clusterColumns) allValues.add(keyColumns.get(column.getName())); client.put(allValues); if (progressable != null) progressable.progress(); if (context != null) HadoopCompat.progress(context); }
@Override @SuppressWarnings("unchecked") protected void setup(Reducer.Context context) throws IOException, InterruptedException { this.context = context; String settingsStr = context.getConfiguration().get(ParameterProcessing.SETTINGS_STR); Settings settings = Settings.loadFromString(settingsStr); Settings.setSettings(settings); String projectStr = context.getConfiguration().get(ParameterProcessing.PROJECT); Project project = Project.loadFromString(projectStr); if (project.isEnvHadoop()) { String metadataFileContents = context.getConfiguration().get(ParameterProcessing.METADATA_FILE); new File(ColumnMetadata.metadataNamesFile).getParentFile().mkdirs(); Files.write(metadataFileContents.getBytes(), new File(ColumnMetadata.metadataNamesFile)); } columnMetadata = new ColumnMetadata(); String fileSeparatorStr = project.getFieldSeparator(); char fieldSeparatorChar = Delimiter.getDelim(fileSeparatorStr); columnMetadata.setFieldSeparator(String.valueOf(fieldSeparatorChar)); columnMetadata.setAllMetadata(project.getMetadataCollect()); // write standard metadata fields context.write(null, new Text(columnMetadata.delimiterSeparatedHeaders())); zipFileWriter.setup(); zipFileWriter.openZipForWriting(); luceneIndex = new LuceneIndex(settings.getLuceneIndexDir(), project.getProjectCode(), null); luceneIndex.init(); }
/**
 * Appends a {@code WHERE} clause with one {@code <quoted name> = ?} condition per partition key
 * column followed by one per cluster column, joined with {@code AND}.
 *
 * @param cqlQuery the query text to extend
 * @return {@code cqlQuery} followed by {@code " WHERE "} and the generated conditions
 */
private String appendKeyWhereClauses(String cqlQuery) {
  StringBuilder conditions = new StringBuilder();

  for (ColumnMetadata partitionKey : partitionKeyColumns) {
    // Only conditions after the very first one are preceded by the AND separator.
    if (conditions.length() > 0) {
      conditions.append(" AND ");
    }
    conditions.append(quote(partitionKey.getName())).append(" = ?");
  }

  // Cluster column conditions are always prefixed with AND.
  for (ColumnMetadata clusterColumn : clusterColumns) {
    conditions.append(" AND ").append(quote(clusterColumn.getName())).append(" = ?");
  }

  return cqlQuery + " WHERE " + conditions;
}
/**
 * Copies an XML column metadata entry into a {@link ColumnMetadataModel}.
 *
 * <p>Name and index name are copied as they are, the validation class is resolved through
 * {@code ComparatorType.getByClassName}, and the index type is only set when the XML entry
 * declares one.
 *
 * @param xmlColumnMetadata XML column metadata to convert
 * @return the populated model
 */
private ColumnMetadataModel mapXmlColumnMetadataToColumMetadataModel(
    ColumnMetadata xmlColumnMetadata) {
  ColumnMetadataModel model = new ColumnMetadataModel();

  // Values that are copied straight across.
  model.setColumnName(xmlColumnMetadata.getName());
  model.setIndexName(xmlColumnMetadata.getIndexName());

  // Values that have to be translated into model types.
  model.setValidationClass(
      ComparatorType.getByClassName(xmlColumnMetadata.getValidationClass().value()));
  if (xmlColumnMetadata.getIndexType() != null) {
    model.setColumnIndexType(
        ColumnIndexType.valueOf(xmlColumnMetadata.getIndexType().value()));
  }

  return model;
}
@Test public void should_iterate_row_with_metadata() throws IOException { // given String[] columnNames = new String[] { "id", "firstname", "lastname", "state", "registration", "city", "birth", "nbCommands", "avgAmount" }; final InputStream input = this.getClass().getResourceAsStream("dataSetRowMetadata.json"); final ObjectMapper mapper = builder.build(); try (JsonParser parser = mapper.getFactory().createParser(input)) { final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser); final Iterator<DataSetRow> iterator = dataSet.getRecords().iterator(); List<ColumnMetadata> actualColumns = new ArrayList<>(); int recordCount = 0; while (iterator.hasNext()) { final DataSetRow next = iterator.next(); actualColumns = next.getRowMetadata().getColumns(); assertThat(actualColumns, not(empty())); recordCount++; } // then assertEquals(10, recordCount); for (int i = 0; i < actualColumns.size(); i++) { final ColumnMetadata column = actualColumns.get(i); assertEquals(columnNames[i], column.getId()); } } catch (Exception e) { throw new TDPException(CommonErrorCodes.UNABLE_TO_PARSE_JSON, e); } }
/**
 * Builds a {@link ColumnModel} (name, value and timestamp) from a single XML column.
 *
 * <p>The column name is typed from {@code comparatorType}: bytes when it is {@code null}, a
 * composite built from the name split on {@code ":"} when it is the composite type, and the type
 * matching the comparator's type name otherwise. The value is typed from {@code metaData} when
 * metadata is given and extracted with {@code defaultColumnValueType} otherwise. The timestamp is
 * parsed as a {@code Long} when the XML column declares one and left {@code null} otherwise.
 *
 * @param metaData metadata associated with the column, or {@code null} when there is none
 * @param xmlColumn XML column to convert
 * @param comparatorType comparator type of the column name, may be {@code null}
 * @param typesBelongingCompositeTypeForComparatorType component types used for a composite name
 * @param defaultColumnValueType value type used when no metadata is given; checked for
 *     {@code null} before the counter test
 * @return the populated column model
 * @throws ParseException if the name does not fit the composite type declaration, or if the
 *     value contains functions while the default value type is the counter type
 */
private ColumnModel mapXmlColumnToColumnModel(
    ColumnMetadata metaData,
    Column xmlColumn,
    ComparatorType comparatorType,
    GenericTypeEnum[] typesBelongingCompositeTypeForComparatorType,
    ComparatorType defaultColumnValueType) {
  ColumnModel model = new ColumnModel();

  // --- name ---
  if (comparatorType == null) {
    model.setName(new GenericType(xmlColumn.getName(), GenericTypeEnum.BYTES_TYPE));
  } else if (!ComparatorType.COMPOSITETYPE.getTypeName().equals(comparatorType.getTypeName())) {
    // simple type
    GenericTypeEnum nameType = GenericTypeEnum.fromValue(comparatorType.getTypeName());
    model.setName(new GenericType(xmlColumn.getName(), nameType));
  } else {
    // composite type: the name holds ':'-separated components
    try {
      model.setName(
          new GenericType(
              StringUtils.split(xmlColumn.getName(), ":"),
              typesBelongingCompositeTypeForComparatorType));
    } catch (IllegalArgumentException e) {
      throw new ParseException(
          xmlColumn.getName() + " doesn't fit with the schema declaration of your composite type");
    }
  }

  // --- counter guard ---
  boolean counterValues =
      defaultColumnValueType != null
          && ComparatorType.COUNTERTYPE
              .getClassName()
              .equals(defaultColumnValueType.getClassName());
  if (counterValues && TypeExtractor.containFunctions(xmlColumn.getValue())) {
    throw new ParseException("Impossible to override Column value into a Counter column family");
  }

  // --- value ---
  GenericType value;
  if (metaData == null) {
    value = TypeExtractor.extract(xmlColumn.getValue(), defaultColumnValueType);
  } else {
    GenericTypeEnum valueType = GenericTypeEnum.valueOf(metaData.getValidationClass().name());
    value = new GenericType(xmlColumn.getValue(), valueType);
  }
  model.setValue(value);

  // --- timestamp ---
  String rawTimestamp = xmlColumn.getTimestamp();
  Long parsedTimestamp = null;
  if (rawTimestamp != null) {
    parsedTimestamp = Long.valueOf(rawTimestamp);
  }
  model.setTimestamp(parsedTimestamp);

  return model;
}
@Override @SuppressWarnings("unchecked") protected void cleanup(Reducer.Context context) throws IOException, InterruptedException { if (!Project.getProject().isMetadataCollectStandard()) { // write summary headers with all metadata, but for standard metadata don't write the last // line context.write(new Text("Hash"), new Text(columnMetadata.delimiterSeparatedHeaders())); } zipFileWriter.closeZip(); if (Project.getProject().isLuceneIndexEnabled()) { mergeLuceneIndex(); } Project project = Project.getProject(); if (project.isEnvHadoop()) { String outputPath = Project.getProject().getProperty(ParameterProcessing.OUTPUT_DIR_HADOOP); String zipFileName = zipFileWriter.getZipFileName(); if (project.isFsHdfs()) { String cmd = "hadoop fs -copyFromLocal " + zipFileName + " " + outputPath + File.separator + context.getTaskAttemptID() + ".zip"; OsUtil.runCommand(cmd); } else if (project.isFsS3()) { S3Agent s3agent = new S3Agent(); String run = project.getRun(); if (!run.isEmpty()) { run = run + "/"; } String s3key = project.getProjectCode() + File.separator + "output/" + run + "results/" + context.getTaskAttemptID() + ".zip"; // Keep updating the hadoop progress int refreshInterval = 60000; Timer timer = new Timer(refreshInterval, this); timer.start(); s3agent.putFileInS3(zipFileName, s3key); timer.stop(); } } Stats.getInstance().setJobFinished(); }
/**
 * Reads the {@code test1.json} resource and verifies the data set level metadata (id, name,
 * author, record and header/footer counts, creation date) as well as the id, type and quality
 * figures of the first and last of its six columns.
 */
@Test
public void testRead1() throws Exception {
  DataSet parsed = from(this.getClass().getResourceAsStream("test1.json"));
  assertNotNull(parsed);

  // data set level metadata
  final DataSetMetadata meta = parsed.getMetadata();
  assertEquals("410d2196-8f90-478f-a817-7e8b6694ac91", meta.getId());
  assertEquals("test", meta.getName());
  assertEquals("anonymousUser", meta.getAuthor());
  assertEquals(2, meta.getContent().getNbRecords());
  assertEquals(1, meta.getContent().getNbLinesInHeader());
  assertEquals(0, meta.getContent().getNbLinesInFooter());

  // creation date, compared in UTC
  final SimpleDateFormat utcFormat = new SimpleDateFormat("MM-dd-yyyy HH:mm");
  utcFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
  Date expectedCreation = utcFormat.parse("02-17-2015 09:02");
  assertEquals(expectedCreation, new Date(meta.getCreationDate()));

  // columns
  List<ColumnMetadata> cols = meta.getRowMetadata().getColumns();
  assertEquals(6, cols.size());

  ColumnMetadata head = cols.get(0);
  assertEquals("0001", head.getId());
  assertEquals("id", head.getName());
  assertEquals("integer", head.getType());
  assertEquals(20, head.getQuality().getEmpty());
  assertEquals(26, head.getQuality().getInvalid());
  assertEquals(54, head.getQuality().getValid());

  ColumnMetadata tail = cols.get(5);
  assertEquals("0007", tail.getId());
  assertEquals("string", tail.getType());
  assertEquals(8, tail.getQuality().getEmpty());
  assertEquals(25, tail.getQuality().getInvalid());
  assertEquals(67, tail.getQuality().getValid());
}
protected void processMap(MapWritable value) throws IOException, InterruptedException { columnMetadata.reinit(); ++outputFileCount; DocumentMetadata allMetadata = getAllMetadata(value); Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount); columnMetadata.addMetadata(standardMetadata); columnMetadata.addMetadata(allMetadata); // documents other than the first one in this loop are either duplicates or attachments if (first) { masterOutputFileCount = outputFileCount; } else { if (allMetadata.hasParent()) { columnMetadata.addMetadataValue( DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount)); } else { columnMetadata.addMetadataValue( DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount)); } } // String uniqueId = allMetadata.getUniqueId(); String originalFileName = new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName(); // add the text to the text folder String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT); String textEntryName = ParameterProcessing.TEXT + "/" + UPIFormat.format(outputFileCount) + "_" + originalFileName + ".txt"; if (textEntryName != null) { zipFileWriter.addTextFile(textEntryName, documentText); } columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName); // add the native file to the native folder String nativeEntryName = ParameterProcessing.NATIVE + "/" + UPIFormat.format(outputFileCount) + "_" + originalFileName; BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE)); if (bytesWritable != null) { // some large exception files are not passed zipFileWriter.addBinaryFile( nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength()); logger.trace("Processing file: {}", nativeEntryName); } columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName); // add the pdf made from native to the PDF folder String pdfNativeEntryName = 
ParameterProcessing.PDF_FOLDER + "/" + UPIFormat.format(outputFileCount) + "_" + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName() + ".pdf"; BytesWritable pdfBytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF)); if (pdfBytesWritable != null) { zipFileWriter.addBinaryFile( pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength()); logger.trace("Processing file: {}", pdfNativeEntryName); } processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount)); // add exception to the exception folder String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION); if (exception != null) { String exceptionEntryName = "exception/" + UPIFormat.format(outputFileCount) + "_" + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName(); if (bytesWritable != null) { zipFileWriter.addBinaryFile( exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength()); } columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName); } // write this all to the reduce map // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues())); // drop the key altogether, because it messes up the format - but put it in the value // TODO use NullWritable if (OsUtil.isNix()) { context.write(null, new Text(columnMetadata.delimiterSeparatedValues())); } // prepare for the next file with the same key, if there is any first = false; }