private void processHiveEntity(
     HiveMetaStoreBridge dgiBridge,
     HiveEventContext event,
     Entity entity,
     Set<String> dataSetsProcessed,
     SortedMap<Entity, Referenceable> dataSets,
     Set<Referenceable> entities)
     throws Exception {
   if (entity.getType() == Type.TABLE || entity.getType() == Type.PARTITION) {
     final String tblQFName =
         dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), entity.getTable());
     if (!dataSetsProcessed.contains(tblQFName)) {
       LinkedHashMap<Type, Referenceable> result =
           createOrUpdateEntities(dgiBridge, event, entity, false);
       dataSets.put(entity, result.get(Type.TABLE));
       dataSetsProcessed.add(tblQFName);
       entities.addAll(result.values());
     }
   } else if (entity.getType() == Type.DFS_DIR) {
     final String pathUri = lower(new Path(entity.getLocation()).toString());
     LOG.info("Registering DFS Path {} ", pathUri);
     if (!dataSetsProcessed.contains(pathUri)) {
       Referenceable hdfsPath = dgiBridge.fillHDFSDataSet(pathUri);
       dataSets.put(entity, hdfsPath);
       dataSetsProcessed.add(pathUri);
       entities.add(hdfsPath);
     }
   }
 }
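The lower() helper used for path and query normalization isn't part of this excerpt; a minimal sketch, assuming it only null-checks, lower-cases, and trims its input:

  // Hypothetical sketch of the lower() helper used throughout these methods:
  // null-safe lower-casing and trimming of paths/query strings.
  private static String lower(String str) {
    if (str == null || str.isEmpty()) {
      return null;
    }
    return str.toLowerCase().trim();
  }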
  private Referenceable replaceSDQFName(
      final HiveEventContext event,
      Referenceable tableEntity,
      final String oldTblQFName,
      final String newTblQFName) {
    // Reset storage desc QF Name to old Name
    final Referenceable sdRef =
        ((Referenceable) tableEntity.get(HiveDataModelGenerator.STORAGE_DESC));
    sdRef.set(
        AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
        HiveMetaStoreBridge.getStorageDescQFName(oldTblQFName));

    // Replace SD QF name first to retain tags
    final String oldSDQFName = HiveMetaStoreBridge.getStorageDescQFName(oldTblQFName);
    final String newSDQFName = HiveMetaStoreBridge.getStorageDescQFName(newTblQFName);

    final Referenceable newSDEntity = new Referenceable(HiveDataTypes.HIVE_STORAGEDESC.getName());
    newSDEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newSDQFName);
    event.addMessage(
        new HookNotification.EntityPartialUpdateRequest(
            event.getUser(),
            HiveDataTypes.HIVE_STORAGEDESC.getName(),
            AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
            oldSDQFName,
            newSDEntity));

    return newSDEntity;
  }
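HiveMetaStoreBridge.getStorageDescQFName isn't shown in this excerpt; a plausible sketch, assuming the storage descriptor's qualified name is just the table's qualified name with a fixed suffix:

  // Hypothetical sketch: derive the storage descriptor QF name by suffixing
  // the table's qualified name.
  public static String getStorageDescQFName(String tableQualifiedName) {
    return tableQualifiedName + "_storage";
  }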
  private List<Referenceable> replaceColumnQFName(
      final HiveEventContext event,
      final List<Referenceable> cols,
      final String oldTableQFName,
      final String newTableQFName) {
    List<Referenceable> newColEntities = new ArrayList<>();
    for (Referenceable col : cols) {
      final String colName = (String) col.get(AtlasClient.NAME);
      String oldColumnQFName = HiveMetaStoreBridge.getColumnQualifiedName(oldTableQFName, colName);
      String newColumnQFName = HiveMetaStoreBridge.getColumnQualifiedName(newTableQFName, colName);
      col.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, oldColumnQFName);

      Referenceable newColEntity = new Referenceable(HiveDataTypes.HIVE_COLUMN.getName());
      // Only the QF name changes
      newColEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newColumnQFName);
      event.addMessage(
          new HookNotification.EntityPartialUpdateRequest(
              event.getUser(),
              HiveDataTypes.HIVE_COLUMN.getName(),
              AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
              oldColumnQFName,
              newColEntity));
      newColEntities.add(newColEntity);
    }
    return newColEntities;
  }
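HiveMetaStoreBridge.getColumnQualifiedName is likewise not included. Judging by the db.table.col@cluster format asserted in testCreateTable below, a sketch could splice the lower-cased column name in before the cluster suffix:

  // Hypothetical sketch: build db.table.col@cluster from db.table@cluster
  // by inserting the lower-cased column name before the cluster suffix.
  public static String getColumnQualifiedName(String tableQualifiedName, String colName) {
    final int sepIdx = tableQualifiedName.lastIndexOf('@');
    final String tableName = tableQualifiedName.substring(0, sepIdx);
    final String clusterName = tableQualifiedName.substring(sepIdx + 1);
    return String.format("%s.%s@%s", tableName, colName.toLowerCase(), clusterName);
  }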
  private void renameTable(HiveMetaStoreBridge dgiBridge, HiveEventContext event) throws Exception {
    // Crappy: there is no easy way of getting the new name
    assert event.getInputs() != null && event.getInputs().size() == 1;
    assert event.getOutputs() != null && event.getOutputs().size() > 0;

    // Create/update the old table entity if it does not already exist
    ReadEntity oldEntity = event.getInputs().iterator().next();
    Table oldTable = oldEntity.getTable();

    for (WriteEntity writeEntity : event.getOutputs()) {
      if (writeEntity.getType() == Entity.Type.TABLE) {
        Table newTable = writeEntity.getTable();
        // Hive sends both the old and new table names in the outputs, which is odd;
        // the check below skips the entry that still carries the old name
        if (!newTable.getDbName().equals(oldTable.getDbName())
            || !newTable.getTableName().equals(oldTable.getTableName())) {
          final String oldQualifiedName =
              dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), oldTable);
          final String newQualifiedName =
              dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), newTable);

          // Create/update the old table entity: create an entity with the old QF name and old
          // table name if it doesn't exist; if it exists, it will be updated.
          // We always use the new entity while creating the table, since some flags and
          // attributes of the table are not set in inputEntity, and Hive.getTable(oldTableName)
          // also fails because the table no longer exists in Hive.
          final LinkedHashMap<Type, Referenceable> tables =
              createOrUpdateEntities(dgiBridge, event, writeEntity, true);
          Referenceable tableEntity = tables.get(Type.TABLE);

          // Reset regular column QF names to the old name, and create a partial update request
          // that replaces the old column QF name with the new one so existing traits are retained
          replaceColumnQFName(
              event,
              (List<Referenceable>) tableEntity.get(HiveDataModelGenerator.COLUMNS),
              oldQualifiedName,
              newQualifiedName);

          // Reset partition key column QF names to the old name, and create a partial update
          // request that replaces the old column QF name with the new one so existing traits
          // are retained
          replaceColumnQFName(
              event,
              (List<Referenceable>) tableEntity.get(HiveDataModelGenerator.PART_COLS),
              oldQualifiedName,
              newQualifiedName);

          // Reset the SD QF name to the old name, and create a partial update request that
          // replaces the old SD QF name with the new one so existing traits are retained
          replaceSDQFName(event, tableEntity, oldQualifiedName, newQualifiedName);

          // Reset the table QF name to the old name, and create a partial update request that
          // replaces the old table QF name with the new one
          replaceTableQFName(
              event, oldTable, newTable, tableEntity, oldQualifiedName, newQualifiedName);
        }
      }
    }
  }
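replaceTableQFName is called above but not part of this excerpt; a sketch following the same reset-then-partial-update pattern as replaceSDQFName and replaceColumnQFName (the exact attribute set is an assumption):

  // Hypothetical sketch: reset the table entity to its old identity, then
  // emit a partial update keyed on the old QF name so traits carry over.
  private Referenceable replaceTableQFName(
      final HiveEventContext event,
      final Table oldTable,
      final Table newTable,
      final Referenceable tableEntity,
      final String oldTableQFName,
      final String newTableQFName) {
    tableEntity.set(AtlasClient.NAME, oldTable.getTableName().toLowerCase());
    tableEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, oldTableQFName);

    // The replacement entity carries only the attributes that change
    final Referenceable newEntity = new Referenceable(HiveDataTypes.HIVE_TABLE.getName());
    newEntity.set(AtlasClient.NAME, newTable.getTableName().toLowerCase());
    newEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newTableQFName);

    event.addMessage(
        new HookNotification.EntityPartialUpdateRequest(
            event.getUser(),
            HiveDataTypes.HIVE_TABLE.getName(),
            AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
            oldTableQFName,
            newEntity));
    return newEntity;
  }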
 private void deleteTable(
     HiveMetaStoreBridge dgiBridge, HiveEventContext event, WriteEntity output) {
   final String tblQualifiedName =
       HiveMetaStoreBridge.getTableQualifiedName(dgiBridge.getClusterName(), output.getTable());
   LOG.info("Deleting table {} ", tblQualifiedName);
   event.addMessage(
       new HookNotification.EntityDeleteRequest(
           event.getUser(),
           HiveDataTypes.HIVE_TABLE.getName(),
           AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
           tblQualifiedName));
 }
  @Test
  public void testCreateTable() throws Exception {
    String tableName = tableName();
    String dbName = createDatabase();
    String colName = "col" + random();
    runCommand("create table " + dbName + "." + tableName + "(" + colName + " int, name string)");
    assertTableIsRegistered(dbName, tableName);

    // There is only one instance of the column registered
    String colId = assertColumnIsRegistered(colName);
    Referenceable colEntity = dgiCLient.getEntity(colId);
    Assert.assertEquals(
        colEntity.get("qualifiedName"),
        String.format(
            "%s.%s.%s@%s",
            dbName.toLowerCase(), tableName.toLowerCase(), colName.toLowerCase(), CLUSTER_NAME));

    tableName = createTable();
    String tableId = assertTableIsRegistered(DEFAULT_DB, tableName);
    Referenceable tableRef = dgiCLient.getEntity(tableId);
    Assert.assertEquals(tableRef.get("tableType"), TableType.MANAGED_TABLE.name());
    Assert.assertEquals(tableRef.get(HiveDataModelGenerator.COMMENT), "table comment");
    String entityName =
        HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, tableName);
    Assert.assertEquals(tableRef.get(HiveDataModelGenerator.NAME), entityName);
    Assert.assertEquals(
        tableRef.get("name"), "default." + tableName.toLowerCase() + "@" + CLUSTER_NAME);

    final Referenceable sdRef = (Referenceable) tableRef.get("sd");
    Assert.assertEquals(sdRef.get(HiveDataModelGenerator.STORAGE_IS_STORED_AS_SUB_DIRS), false);

    // Creating the table should have registered the database instance as well
    assertDatabaseIsRegistered(DEFAULT_DB);
  }
  private void handleExternalTables(
      final HiveMetaStoreBridge dgiBridge,
      final HiveEventContext event,
      final LinkedHashMap<Type, Referenceable> tables)
      throws HiveException, MalformedURLException {
    List<Referenceable> entities = new ArrayList<>();
    final Entity hiveEntity = getEntityByType(event.getOutputs(), Type.TABLE);
    Table hiveTable = hiveEntity.getTable();
    // Refresh to get the correct location
    hiveTable = dgiBridge.hiveClient.getTable(hiveTable.getDbName(), hiveTable.getTableName());

    if (hiveTable != null && TableType.EXTERNAL_TABLE.equals(hiveTable.getTableType())) {
      final String location = lower(hiveTable.getDataLocation().toString());
      LOG.info("Registering external table process {} ", event.getQueryStr());
      final ReadEntity dfsEntity = new ReadEntity();
      dfsEntity.setTyp(Type.DFS_DIR);
      dfsEntity.setName(location);

      SortedMap<Entity, Referenceable> inputs =
          new TreeMap<Entity, Referenceable>(entityComparator) {
            {
              put(dfsEntity, dgiBridge.fillHDFSDataSet(location));
            }
          };

      SortedMap<Entity, Referenceable> outputs =
          new TreeMap<Entity, Referenceable>(entityComparator) {
            {
              put(hiveEntity, tables.get(Type.TABLE));
            }
          };

      Referenceable processReferenceable =
          getProcessReferenceable(dgiBridge, event, inputs, outputs);
      String tableQualifiedName =
          dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), hiveTable);

      if (isCreateOp(event)) {
        processReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, tableQualifiedName);
      }
      entities.addAll(tables.values());
      entities.add(processReferenceable);
      event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    }
  }
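handleExternalTables leans on two helpers that aren't shown, getEntityByType and isCreateOp; minimal sketches, assuming a linear scan for the former and a check for CREATE-style Hive operations for the latter:

  // Hypothetical sketch: return the first entity of the requested type.
  private static Entity getEntityByType(Set<? extends Entity> entities, Type entityType) {
    for (Entity entity : entities) {
      if (entity.getType() == entityType) {
        return entity;
      }
    }
    return null;
  }

  // Hypothetical sketch: treat CREATE TABLE / CTAS / CREATE VIEW as create ops.
  private static boolean isCreateOp(HiveEventContext hiveEvent) {
    return HiveOperation.CREATETABLE.equals(hiveEvent.getOperation())
        || HiveOperation.CREATETABLE_AS_SELECT.equals(hiveEvent.getOperation())
        || HiveOperation.CREATEVIEW.equals(hiveEvent.getOperation());
  }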
  private void deleteDatabase(HiveMetaStoreBridge dgiBridge, HiveEventContext event) {
    if (event.getOutputs().size() > 1) {
      LOG.info("Starting deletion of tables and databases with cascade {} ", event.getQueryStr());
    }

    for (WriteEntity output : event.getOutputs()) {
      if (Type.TABLE.equals(output.getType())) {
        deleteTable(dgiBridge, event, output);
      } else if (Type.DATABASE.equals(output.getType())) {
        final String dbQualifiedName =
            HiveMetaStoreBridge.getDBQualifiedName(
                dgiBridge.getClusterName(), output.getDatabase().getName());
        event.addMessage(
            new HookNotification.EntityDeleteRequest(
                event.getUser(),
                HiveDataTypes.HIVE_DB.getName(),
                AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                dbQualifiedName));
      }
    }
  }
  private void renameColumn(HiveMetaStoreBridge dgiBridge, HiveEventContext event)
      throws Exception {
    assert event.getInputs() != null && event.getInputs().size() == 1;
    assert event.getOutputs() != null && event.getOutputs().size() > 0;

    Table oldTable = event.getInputs().iterator().next().getTable();
    List<FieldSchema> oldColList = oldTable.getAllCols();
    Table outputTbl = event.getOutputs().iterator().next().getTable();
    outputTbl = dgiBridge.hiveClient.getTable(outputTbl.getDbName(), outputTbl.getTableName());
    List<FieldSchema> newColList = outputTbl.getAllCols();
    assert oldColList.size() == newColList.size();

    Pair<String, String> changedColNamePair = findChangedColNames(oldColList, newColList);
    String oldColName = changedColNamePair.getLeft();
    String newColName = changedColNamePair.getRight();
    for (WriteEntity writeEntity : event.getOutputs()) {
      if (writeEntity.getType() == Type.TABLE) {
        Table newTable = writeEntity.getTable();
        createOrUpdateEntities(dgiBridge, event, writeEntity, true, oldTable);
        final String newQualifiedTableName =
            dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), newTable);
        String oldColumnQFName =
            HiveMetaStoreBridge.getColumnQualifiedName(newQualifiedTableName, oldColName);
        String newColumnQFName =
            HiveMetaStoreBridge.getColumnQualifiedName(newQualifiedTableName, newColName);
        Referenceable newColEntity = new Referenceable(HiveDataTypes.HIVE_COLUMN.getName());
        newColEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newColumnQFName);

        event.addMessage(
            new HookNotification.EntityPartialUpdateRequest(
                event.getUser(),
                HiveDataTypes.HIVE_COLUMN.getName(),
                AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                oldColumnQFName,
                newColEntity));
      }
    }
    handleEventOutputs(dgiBridge, event, Type.TABLE);
  }
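findChangedColNames is referenced above but not included. Since renameColumn asserts the old and new column lists are the same size, a positional diff is one plausible sketch (this shape is an assumption; it relies on the rename preserving column order):

  // Hypothetical sketch: the lists are asserted equal in size, so compare
  // position by position and return the first differing (old, new) pair.
  private static Pair<String, String> findChangedColNames(
      List<FieldSchema> oldColList, List<FieldSchema> newColList) {
    for (int i = 0; i < oldColList.size(); i++) {
      final String oldName = oldColList.get(i).getName();
      final String newName = newColList.get(i).getName();
      if (!oldName.equalsIgnoreCase(newName)) {
        return Pair.of(oldName, newName);
      }
    }
    return null; // no rename detected
  }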
  @Test
  public void testLineage() throws Exception {
    String table1 = createTable(false);

    String db2 = createDatabase();
    String table2 = tableName();

    String query = String.format("create table %s.%s as select * from %s", db2, table2, table1);
    runCommand(query);
    String table1Id = assertTableIsRegistered(DEFAULT_DB, table1);
    String table2Id = assertTableIsRegistered(db2, table2);

    String datasetName = HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, db2, table2);
    JSONObject response = dgiCLient.getInputGraph(datasetName);
    JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
    Assert.assertTrue(vertices.has(table1Id));
    Assert.assertTrue(vertices.has(table2Id));

    datasetName = HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, table1);
    response = dgiCLient.getOutputGraph(datasetName);
    vertices = response.getJSONObject("values").getJSONObject("vertices");
    Assert.assertTrue(vertices.has(table1Id));
    Assert.assertTrue(vertices.has(table2Id));
  }
  private Referenceable getProcessReferenceable(
      HiveMetaStoreBridge dgiBridge,
      HiveEventContext hiveEvent,
      SortedMap<Entity, Referenceable> source,
      SortedMap<Entity, Referenceable> target) {
    Referenceable processReferenceable = new Referenceable(HiveDataTypes.HIVE_PROCESS.getName());

    String queryStr = lower(hiveEvent.getQueryStr());
    processReferenceable.set(
        AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
        getProcessQualifiedName(hiveEvent.getOperation(), source, target));

    LOG.debug("Registering query: {}", queryStr);
    List<Referenceable> sourceList = new ArrayList<>(source.values());
    List<Referenceable> targetList = new ArrayList<>(target.values());

    // The serialization code expects a list; set only when non-empty
    if (!sourceList.isEmpty()) {
      processReferenceable.set("inputs", sourceList);
    }
    if (!targetList.isEmpty()) {
      processReferenceable.set("outputs", targetList);
    }
    processReferenceable.set(AtlasClient.NAME, queryStr);

    processReferenceable.set("operationType", hiveEvent.getOperation().getOperationName());
    processReferenceable.set("startTime", new Date(hiveEvent.getQueryStartTime()));
    processReferenceable.set("userName", hiveEvent.getUser());
    processReferenceable.set("queryText", queryStr);
    processReferenceable.set("queryId", hiveEvent.getQueryId());
    processReferenceable.set("queryPlan", hiveEvent.getJsonPlan());
    processReferenceable.set(AtlasConstants.CLUSTER_NAME_ATTRIBUTE, dgiBridge.getClusterName());

    List<String> recentQueries = new ArrayList<>(1);
    recentQueries.add(queryStr);
    processReferenceable.set("recentQueries", recentQueries);

    processReferenceable.set("endTime", new Date(System.currentTimeMillis()));
    // TODO set queryGraph
    return processReferenceable;
  }
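getProcessQualifiedName isn't part of this excerpt; a sketch, assuming the process QF name concatenates the operation name with the sorted input and output qualified names so the same lineage always maps to the same process entity (the exact separators are an assumption):

  // Hypothetical sketch: operation name plus the QF names of the sorted
  // inputs and outputs.
  private String getProcessQualifiedName(
      HiveOperation op,
      SortedMap<Entity, Referenceable> inputs,
      SortedMap<Entity, Referenceable> outputs) {
    StringBuilder buffer = new StringBuilder(op.getOperationName());
    for (Referenceable input : inputs.values()) {
      buffer.append(':').append(input.get(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME));
    }
    buffer.append("->");
    for (Referenceable output : outputs.values()) {
      buffer.append(':').append(output.get(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME));
    }
    return buffer.toString();
  }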
  private LinkedHashMap<Type, Referenceable> createOrUpdateEntities(
      HiveMetaStoreBridge dgiBridge,
      HiveEventContext event,
      Entity entity,
      boolean skipTempTables,
      Table existTable)
      throws Exception {
    Database db = null;
    Table table = null;
    Partition partition = null;
    LinkedHashMap<Type, Referenceable> result = new LinkedHashMap<>();
    List<Referenceable> entities = new ArrayList<>();

    switch (entity.getType()) {
      case DATABASE:
        db = entity.getDatabase();
        break;

      case TABLE:
        table = entity.getTable();
        db = dgiBridge.hiveClient.getDatabase(table.getDbName());
        break;

      case PARTITION:
        partition = entity.getPartition();
        table = partition.getTable();
        db = dgiBridge.hiveClient.getDatabase(table.getDbName());
        break;
    }

    db = dgiBridge.hiveClient.getDatabase(db.getName());
    Referenceable dbEntity = dgiBridge.createDBInstance(db);

    entities.add(dbEntity);
    result.put(Type.DATABASE, dbEntity);

    Referenceable tableEntity = null;

    if (table != null) {
      if (existTable != null) {
        table = existTable;
      } else {
        table = dgiBridge.hiveClient.getTable(table.getDbName(), table.getTableName());
      }
      // If it is an external table, we create it even when the temp-table skip flag is on,
      // since we need the HDFS path for temp table lineage.
      if (skipTempTables
          && table.isTemporary()
          && !TableType.EXTERNAL_TABLE.equals(table.getTableType())) {
        LOG.debug(
            "Skipping registration of temporary table {} since it is not an external table (type: {})",
            table.getTableName(),
            table.getTableType().name());

      } else {
        tableEntity = dgiBridge.createTableInstance(dbEntity, table);
        entities.add(tableEntity);
        result.put(Type.TABLE, tableEntity);
      }
    }

    event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    return result;
  }
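processHiveEntity and renameTable above call a four-argument createOrUpdateEntities; presumably it delegates to the five-argument variant with no pre-fetched table (a sketch):

  // Hypothetical sketch: the four-argument overload passes null for the
  // existing table, so the table is re-read from the Hive client.
  private LinkedHashMap<Type, Referenceable> createOrUpdateEntities(
      HiveMetaStoreBridge dgiBridge,
      HiveEventContext event,
      Entity entity,
      boolean skipTempTables)
      throws Exception {
    return createOrUpdateEntities(dgiBridge, event, entity, skipTempTables, null);
  }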