/**
   * Maps a list of XML columns to a list of column models.
   *
   * @param columnMetaData column metadata to associate with the XML columns by name
   * @param xmlColumns the XML columns to map
   * @param columnNameComparatorType the comparator type of the column names
   * @param typesBelongingCompositeTypeForComparatorType the component types of a composite
   *     comparator type
   * @param defaultColumnValueType the default comparator type for column values
   * @return the mapped column models
   */
  private List<ColumnModel> mapXmlColumnsToColumnsModel(
      List<ColumnMetadata> columnMetaData,
      List<Column> xmlColumns,
      ComparatorType columnNameComparatorType,
      GenericTypeEnum[] typesBelongingCompositeTypeForComparatorType,
      ComparatorType defaultColumnValueType) {
    List<ColumnModel> columnsModel = new ArrayList<ColumnModel>();

    for (Column xmlColumn : xmlColumns) {
      // find the metadata associated with this column by name, if any
      ColumnMetadata assocMetaData = null;
      for (ColumnMetadata tmpColumnMetaData : columnMetaData) {
        if (tmpColumnMetaData.getName().equals(xmlColumn.getName())) {
          assocMetaData = tmpColumnMetaData;
          break;
        }
      }
      columnsModel.add(
          mapXmlColumnToColumnModel(
              assocMetaData,
              xmlColumn,
              columnNameComparatorType,
              typesBelongingCompositeTypeForComparatorType,
              defaultColumnValueType));
    }
    return columnsModel;
  }
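
A note on the lookup above: the nested loop rescans the whole metadata list for every XML column. A minimal sketch of an index-by-name alternative, assuming column names are unique within the list (the helper name indexMetadataByName is hypothetical):

  // Build the index once (java.util.HashMap / java.util.Map); each lookup in
  // the column loop then becomes metaDataByName.get(xmlColumn.getName()).
  private Map<String, ColumnMetadata> indexMetadataByName(List<ColumnMetadata> columnMetaData) {
    Map<String, ColumnMetadata> byName = new HashMap<String, ColumnMetadata>();
    for (ColumnMetadata metaData : columnMetaData) {
      byName.put(metaData.getName(), metaData);
    }
    return byName;
  }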
  /**
   * If the key maps to a valid value, a mutation is created for it with the given table and
   * columns. If the value of a column is missing (i.e., null), that column is marked for
   * {@link Deletion}. Similarly, if the entire value for a key is missing (i.e., null), the whole
   * key is marked for {@link Deletion}.
   *
   * @param keyColumns the key columns to write.
   * @param values the values to write.
   * @throws IOException if the write fails.
   */
  @Override
  public void write(Map<String, ByteBuffer> keyColumns, List<ByteBuffer> values)
      throws IOException {
    TokenRange range = ringCache.getRange(getPartitionKey(keyColumns));

    // get the client for the given range, or create a new one
    final InetAddress address = ringCache.getEndpoints(range).get(0);
    RangeClient client = clients.get(address);
    if (client == null) {
      // haven't seen keys for this range: create new client
      client = new RangeClient(ringCache.getEndpoints(range));
      client.start();
      clients.put(address, client);
    }

    // add primary key columns to the bind variables
    List<ByteBuffer> allValues = new ArrayList<ByteBuffer>(values);
    for (ColumnMetadata column : partitionKeyColumns) {
      allValues.add(keyColumns.get(column.getName()));
    }
    for (ColumnMetadata column : clusterColumns) {
      allValues.add(keyColumns.get(column.getName()));
    }

    client.put(allValues);

    if (progressable != null) {
      progressable.progress();
    }
    if (context != null) {
      HadoopCompat.progress(context);
    }
  }
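
For orientation, a self-contained sketch of the bind order produced by write(): non-key values first, then partition key columns, then clustering columns. The class and column names here are hypothetical, not from the original job:

import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class BindOrderSketch {
  public static void main(String[] args) {
    // hypothetical key columns: "k" (partition key) and "c" (clustering column)
    Map<String, ByteBuffer> keyColumns = new HashMap<String, ByteBuffer>();
    keyColumns.put("k", ByteBuffer.wrap(new byte[] {1}));
    keyColumns.put("c", ByteBuffer.wrap(new byte[] {2}));
    List<ByteBuffer> values = Arrays.asList(ByteBuffer.wrap(new byte[] {9}));

    // same ordering as write(): values, then partition keys, then cluster columns
    List<ByteBuffer> allValues = new ArrayList<ByteBuffer>(values);
    allValues.add(keyColumns.get("k"));
    allValues.add(keyColumns.get("c"));
    System.out.println(allValues.size()); // 3 bind variables, in query order
  }
}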
  @Override
  @SuppressWarnings("unchecked")
  protected void setup(Reducer.Context context) throws IOException, InterruptedException {
    this.context = context;
    String settingsStr = context.getConfiguration().get(ParameterProcessing.SETTINGS_STR);
    Settings settings = Settings.loadFromString(settingsStr);
    Settings.setSettings(settings);

    String projectStr = context.getConfiguration().get(ParameterProcessing.PROJECT);
    Project project = Project.loadFromString(projectStr);
    if (project.isEnvHadoop()) {
      String metadataFileContents =
          context.getConfiguration().get(ParameterProcessing.METADATA_FILE);
      new File(ColumnMetadata.metadataNamesFile).getParentFile().mkdirs();
      Files.write(metadataFileContents.getBytes(), new File(ColumnMetadata.metadataNamesFile));
    }
    columnMetadata = new ColumnMetadata();
    String fileSeparatorStr = project.getFieldSeparator();
    char fieldSeparatorChar = Delimiter.getDelim(fileSeparatorStr);
    columnMetadata.setFieldSeparator(String.valueOf(fieldSeparatorChar));
    columnMetadata.setAllMetadata(project.getMetadataCollect());
    // write standard metadata fields
    context.write(null, new Text(columnMetadata.delimiterSeparatedHeaders()));
    zipFileWriter.setup();
    zipFileWriter.openZipForWriting();

    luceneIndex = new LuceneIndex(settings.getLuceneIndexDir(), project.getProjectCode(), null);
    luceneIndex.init();
  }
  /** Appends WHERE clauses for the partition key and clustering columns to the given CQL query. */
  private String appendKeyWhereClauses(String cqlQuery) {
    StringBuilder keyWhereClause = new StringBuilder();

    for (ColumnMetadata partitionKey : partitionKeyColumns) {
      if (keyWhereClause.length() > 0) {
        keyWhereClause.append(" AND ");
      }
      keyWhereClause.append(quote(partitionKey.getName())).append(" = ?");
    }
    for (ColumnMetadata clusterColumn : clusterColumns) {
      keyWhereClause.append(" AND ").append(quote(clusterColumn.getName())).append(" = ?");
    }

    return cqlQuery + " WHERE " + keyWhereClause;
  }
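
A standalone, runnable sketch of the WHERE-clause construction above; the key names k1, k2, and c1 are hypothetical:

import java.util.Arrays;
import java.util.List;

public class KeyWhereClauseSketch {
  private static String quote(String name) {
    return "\"" + name + "\"";
  }

  static String appendKeyWhereClauses(
      String cqlQuery, List<String> partitionKeys, List<String> clusterColumns) {
    StringBuilder where = new StringBuilder();
    for (String key : partitionKeys) {
      if (where.length() > 0) {
        where.append(" AND ");
      }
      where.append(quote(key)).append(" = ?");
    }
    for (String col : clusterColumns) {
      where.append(" AND ").append(quote(col)).append(" = ?");
    }
    return cqlQuery + " WHERE " + where;
  }

  public static void main(String[] args) {
    // prints: UPDATE ks.tbl SET v = ? WHERE "k1" = ? AND "k2" = ? AND "c1" = ?
    System.out.println(
        appendKeyWhereClauses(
            "UPDATE ks.tbl SET v = ?", Arrays.asList("k1", "k2"), Arrays.asList("c1")));
  }
}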
  /** Maps XML column metadata to a column metadata model. */
  private ColumnMetadataModel mapXmlColumnMetadataToColumnMetadataModel(
      ColumnMetadata xmlColumnMetadata) {
    ColumnMetadataModel columnMetadata = new ColumnMetadataModel();
    columnMetadata.setColumnName(xmlColumnMetadata.getName());
    columnMetadata.setValidationClass(
        ComparatorType.getByClassName(xmlColumnMetadata.getValidationClass().value()));
    if (xmlColumnMetadata.getIndexType() != null) {
      columnMetadata.setColumnIndexType(
          ColumnIndexType.valueOf(xmlColumnMetadata.getIndexType().value()));
    }

    columnMetadata.setIndexName(xmlColumnMetadata.getIndexName());

    return columnMetadata;
  }
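
For context, the validation class above is resolved through Hector's ComparatorType. A minimal illustration, assuming me.prettyprint.hector.api.ddl.ComparatorType is on the classpath and that "UTF8Type" is one of the values the XML schema allows:

  // illustration only; the real value comes from xmlColumnMetadata.getValidationClass()
  ComparatorType validation = ComparatorType.getByClassName("UTF8Type");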
Example #6
  @Test
  public void should_iterate_row_with_metadata() throws IOException {
    // given
    String[] columnNames =
        new String[] {
          "id",
          "firstname",
          "lastname",
          "state",
          "registration",
          "city",
          "birth",
          "nbCommands",
          "avgAmount"
        };

    final InputStream input = this.getClass().getResourceAsStream("dataSetRowMetadata.json");
    final ObjectMapper mapper = builder.build();
    try (JsonParser parser = mapper.getFactory().createParser(input)) {
      final DataSet dataSet = mapper.readerFor(DataSet.class).readValue(parser);
      final Iterator<DataSetRow> iterator = dataSet.getRecords().iterator();

      List<ColumnMetadata> actualColumns = new ArrayList<>();
      int recordCount = 0;
      while (iterator.hasNext()) {
        final DataSetRow next = iterator.next();
        actualColumns = next.getRowMetadata().getColumns();
        assertThat(actualColumns, not(empty()));
        recordCount++;
      }

      // then
      assertEquals(10, recordCount);
      for (int i = 0; i < actualColumns.size(); i++) {
        final ColumnMetadata column = actualColumns.get(i);
        assertEquals(columnNames[i], column.getId());
      }
    } catch (Exception e) {
      throw new TDPException(CommonErrorCodes.UNABLE_TO_PARSE_JSON, e);
    }
  }
  /**
   * Maps a single XML column to a column model.
   *
   * @param metaData the column metadata associated with the XML column, or null if none
   * @param xmlColumn the XML column to map
   * @param comparatorType the comparator type of the column name
   * @param typesBelongingCompositeTypeForComparatorType the component types of a composite
   *     comparator type
   * @param defaultColumnValueType the default comparator type for the column value
   * @return the mapped column model
   */
  private ColumnModel mapXmlColumnToColumnModel(
      ColumnMetadata metaData,
      Column xmlColumn,
      ComparatorType comparatorType,
      GenericTypeEnum[] typesBelongingCompositeTypeForComparatorType,
      ComparatorType defaultColumnValueType) {
    ColumnModel columnModel = new ColumnModel();

    if (comparatorType == null) {
      columnModel.setName(new GenericType(xmlColumn.getName(), GenericTypeEnum.BYTES_TYPE));
    } else if (ComparatorType.COMPOSITETYPE.getTypeName().equals(comparatorType.getTypeName())) {
      /* composite type */
      try {
        columnModel.setName(
            new GenericType(
                StringUtils.split(xmlColumn.getName(), ":"),
                typesBelongingCompositeTypeForComparatorType));
      } catch (IllegalArgumentException e) {
        throw new ParseException(
            xmlColumn.getName()
                + " does not match the schema declaration of your composite type");
      }
    } else {
      /* simple type */
      columnModel.setName(
          new GenericType(
              xmlColumn.getName(), GenericTypeEnum.fromValue(comparatorType.getTypeName())));
    }

    if (defaultColumnValueType != null
        && ComparatorType.COUNTERTYPE.getClassName().equals(defaultColumnValueType.getClassName())
        && TypeExtractor.containFunctions(xmlColumn.getValue())) {
      throw new ParseException("Cannot override the column value in a counter column family");
    }

    GenericType columnValue = null;
    if (metaData != null) {
      GenericTypeEnum genTypeEnum = GenericTypeEnum.valueOf(metaData.getValidationClass().name());
      columnValue = new GenericType(xmlColumn.getValue(), genTypeEnum);
    } else {
      columnValue = TypeExtractor.extract(xmlColumn.getValue(), defaultColumnValueType);
    }
    columnModel.setValue(columnValue);

    String timestamp = xmlColumn.getTimestamp();
    if (timestamp != null) {
      columnModel.setTimestamp(Long.valueOf(timestamp));
    } else {
      columnModel.setTimestamp(null);
    }

    return columnModel;
  }
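
For the composite-type branch above, a small illustration of how a column name is split before being paired with the declared component types. The value "11:jsmith" and the type array are hypothetical:

  // Commons Lang StringUtils, as used above; splits on ':'
  String[] parts = StringUtils.split("11:jsmith", ":");
  // parts = ["11", "jsmith"], paired element-wise with e.g.
  // new GenericTypeEnum[] {GenericTypeEnum.LONG_TYPE, GenericTypeEnum.UTF_8_TYPE}
  // when constructing the composite GenericType for the column name.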
  @Override
  @SuppressWarnings("unchecked")
  protected void cleanup(Reducer.Context context) throws IOException, InterruptedException {
    if (!Project.getProject().isMetadataCollectStandard()) {
      // write the summary header with all metadata; for standard metadata, skip this last line
      context.write(new Text("Hash"), new Text(columnMetadata.delimiterSeparatedHeaders()));
    }
    zipFileWriter.closeZip();

    if (Project.getProject().isLuceneIndexEnabled()) {
      mergeLuceneIndex();
    }

    Project project = Project.getProject();
    if (project.isEnvHadoop()) {
      String outputPath = project.getProperty(ParameterProcessing.OUTPUT_DIR_HADOOP);
      String zipFileName = zipFileWriter.getZipFileName();
      if (project.isFsHdfs()) {
        String cmd =
            "hadoop fs -copyFromLocal "
                + zipFileName
                + " "
                + outputPath
                + File.separator
                + context.getTaskAttemptID()
                + ".zip";
        OsUtil.runCommand(cmd);
      } else if (project.isFsS3()) {
        S3Agent s3agent = new S3Agent();
        String run = project.getRun();
        if (!run.isEmpty()) {
          run = run + "/";
        }
        String s3key =
            project.getProjectCode()
                + File.separator
                + "output/"
                + run
                + "results/"
                + context.getTaskAttemptID()
                + ".zip";
        // Keep updating the hadoop progress
        int refreshInterval = 60000;
        Timer timer = new Timer(refreshInterval, this);
        timer.start();
        s3agent.putFileInS3(zipFileName, s3key);
        timer.stop();
      }
    }
    Stats.getInstance().setJobFinished();
  }
Example #9
  @Test
  public void testRead1() throws Exception {

    DataSet dataSet = from(this.getClass().getResourceAsStream("test1.json"));
    assertNotNull(dataSet);

    final DataSetMetadata metadata = dataSet.getMetadata();
    assertEquals("410d2196-8f90-478f-a817-7e8b6694ac91", metadata.getId());
    assertEquals("test", metadata.getName());
    assertEquals("anonymousUser", metadata.getAuthor());
    assertEquals(2, metadata.getContent().getNbRecords());
    assertEquals(1, metadata.getContent().getNbLinesInHeader());
    assertEquals(0, metadata.getContent().getNbLinesInFooter());

    final SimpleDateFormat dateFormat = new SimpleDateFormat("MM-dd-yyyy HH:mm");
    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));

    Date expectedDate = dateFormat.parse("02-17-2015 09:02");
    assertEquals(expectedDate, new Date(metadata.getCreationDate()));

    List<ColumnMetadata> columns = dataSet.getMetadata().getRowMetadata().getColumns();
    assertEquals(6, columns.size());

    ColumnMetadata firstColumn = columns.get(0);
    assertEquals("0001", firstColumn.getId());
    assertEquals("id", firstColumn.getName());
    assertEquals("integer", firstColumn.getType());
    assertEquals(20, firstColumn.getQuality().getEmpty());
    assertEquals(26, firstColumn.getQuality().getInvalid());
    assertEquals(54, firstColumn.getQuality().getValid());

    ColumnMetadata lastColumn = columns.get(5);
    assertEquals("0007", lastColumn.getId());
    assertEquals("string", lastColumn.getType());
    assertEquals(8, lastColumn.getQuality().getEmpty());
    assertEquals(25, lastColumn.getQuality().getInvalid());
    assertEquals(67, lastColumn.getQuality().getValid());
  }
  protected void processMap(MapWritable value) throws IOException, InterruptedException {
    columnMetadata.reinit();
    ++outputFileCount;
    DocumentMetadata allMetadata = getAllMetadata(value);
    Metadata standardMetadata = getStandardMetadata(allMetadata, outputFileCount);
    columnMetadata.addMetadata(standardMetadata);
    columnMetadata.addMetadata(allMetadata);
    // documents other than the first one in this loop are either duplicates or attachments
    if (first) {
      masterOutputFileCount = outputFileCount;
    } else {
      if (allMetadata.hasParent()) {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.ATTACHMENT_PARENT, UPIFormat.format(masterOutputFileCount));
      } else {
        columnMetadata.addMetadataValue(
            DocumentMetadataKeys.MASTER_DUPLICATE, UPIFormat.format(masterOutputFileCount));
      }
    }

    String originalFileName =
        new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
    // add the text to the text folder
    String documentText = allMetadata.get(DocumentMetadataKeys.DOCUMENT_TEXT);
    String textEntryName =
        ParameterProcessing.TEXT
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName
            + ".txt";
    if (documentText != null) { // textEntryName is never null here; guard on the text instead
      zipFileWriter.addTextFile(textEntryName, documentText);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_TEXT, textEntryName);
    // add the native file to the native folder
    String nativeEntryName =
        ParameterProcessing.NATIVE
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + originalFileName;
    BytesWritable bytesWritable = (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE));
    if (bytesWritable != null) { // some large exception files are not passed
      zipFileWriter.addBinaryFile(
          nativeEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      logger.trace("Processing file: {}", nativeEntryName);
    }
    columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_NATIVE, nativeEntryName);
    // add the pdf made from native to the PDF folder
    String pdfNativeEntryName =
        ParameterProcessing.PDF_FOLDER
            + "/"
            + UPIFormat.format(outputFileCount)
            + "_"
            + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName()
            + ".pdf";
    BytesWritable pdfBytesWritable =
        (BytesWritable) value.get(new Text(ParameterProcessing.NATIVE_AS_PDF));
    if (pdfBytesWritable != null) {
      zipFileWriter.addBinaryFile(
          pdfNativeEntryName, pdfBytesWritable.getBytes(), pdfBytesWritable.getLength());
      logger.trace("Processing file: {}", pdfNativeEntryName);
    }

    processHtmlContent(value, allMetadata, UPIFormat.format(outputFileCount));

    // add exception to the exception folder
    String exception = allMetadata.get(DocumentMetadataKeys.PROCESSING_EXCEPTION);
    if (exception != null) {
      String exceptionEntryName =
          "exception/"
              + UPIFormat.format(outputFileCount)
              + "_"
              + new File(allMetadata.get(DocumentMetadataKeys.DOCUMENT_ORIGINAL_PATH)).getName();
      if (bytesWritable != null) {
        zipFileWriter.addBinaryFile(
            exceptionEntryName, bytesWritable.getBytes(), bytesWritable.getLength());
      }
      columnMetadata.addMetadataValue(DocumentMetadataKeys.LINK_EXCEPTION, exceptionEntryName);
    }
    // write this all to the reduce map
    // context.write(new Text(outputKey), new Text(columnMetadata.delimiterSeparatedValues()));
    // drop the key altogether, because it messes up the format - but put it in the value
    // TODO use NullWritable
    if (OsUtil.isNix()) {
      context.write(null, new Text(columnMetadata.delimiterSeparatedValues()));
    }
    // prepare for the next file with the same key, if there is any
    first = false;
  }