private void processHiveEntity(
        HiveMetaStoreBridge dgiBridge,
        HiveEventContext event,
        Entity entity,
        Set<String> dataSetsProcessed,
        SortedMap<Entity, Referenceable> dataSets,
        Set<Referenceable> entities)
        throws Exception {
    if (entity.getType() == Type.TABLE || entity.getType() == Type.PARTITION) {
        final String tblQFName =
            dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), entity.getTable());
        if (!dataSetsProcessed.contains(tblQFName)) {
            LinkedHashMap<Type, Referenceable> result =
                createOrUpdateEntities(dgiBridge, event, entity, false);
            dataSets.put(entity, result.get(Type.TABLE));
            dataSetsProcessed.add(tblQFName);
            entities.addAll(result.values());
        }
    } else if (entity.getType() == Type.DFS_DIR) {
        final String pathUri = lower(new Path(entity.getLocation()).toString());
        LOG.info("Registering DFS path {}", pathUri);
        if (!dataSetsProcessed.contains(pathUri)) {
            Referenceable hdfsPath = dgiBridge.fillHDFSDataSet(pathUri);
            dataSets.put(entity, hdfsPath);
            dataSetsProcessed.add(pathUri);
            entities.add(hdfsPath);
        }
    }
}
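// NOTE: the lower(...) helper used above is not part of this excerpt. A minimal sketch,
// assuming it only normalizes a string to lower case (null/empty safe); the actual
// implementation may differ.
private static String lower(String str) {
    if (str == null || str.isEmpty()) {
        return null; // treat missing input as absent rather than ""
    }
    return str.toLowerCase().trim();
}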
private Referenceable replaceSDQFName(
        final HiveEventContext event,
        Referenceable tableEntity,
        final String oldTblQFName,
        final String newTblQFName) {
    // Reset the storage descriptor QF name to the old name
    final Referenceable sdRef =
        (Referenceable) tableEntity.get(HiveDataModelGenerator.STORAGE_DESC);
    sdRef.set(
        AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
        HiveMetaStoreBridge.getStorageDescQFName(oldTblQFName));

    // Replace the SD QF name first so that existing tags are retained
    final String oldSDQFName = HiveMetaStoreBridge.getStorageDescQFName(oldTblQFName);
    final String newSDQFName = HiveMetaStoreBridge.getStorageDescQFName(newTblQFName);
    final Referenceable newSDEntity = new Referenceable(HiveDataTypes.HIVE_STORAGEDESC.getName());
    newSDEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newSDQFName);

    event.addMessage(
        new HookNotification.EntityPartialUpdateRequest(
            event.getUser(),
            HiveDataTypes.HIVE_STORAGEDESC.getName(),
            AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
            oldSDQFName,
            newSDEntity));
    return newSDEntity;
}
private List<Referenceable> replaceColumnQFName(
        final HiveEventContext event,
        final List<Referenceable> cols,
        final String oldTableQFName,
        final String newTableQFName) {
    List<Referenceable> newColEntities = new ArrayList<>();
    for (Referenceable col : cols) {
        final String colName = (String) col.get(AtlasClient.NAME);
        String oldColumnQFName = HiveMetaStoreBridge.getColumnQualifiedName(oldTableQFName, colName);
        String newColumnQFName = HiveMetaStoreBridge.getColumnQualifiedName(newTableQFName, colName);
        col.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, oldColumnQFName);

        // Only the QF name changes; all other column attributes are retained
        Referenceable newColEntity = new Referenceable(HiveDataTypes.HIVE_COLUMN.getName());
        newColEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newColumnQFName);

        event.addMessage(
            new HookNotification.EntityPartialUpdateRequest(
                event.getUser(),
                HiveDataTypes.HIVE_COLUMN.getName(),
                AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                oldColumnQFName,
                newColEntity));
        newColEntities.add(newColEntity);
    }
    return newColEntities;
}
private void renameTable(HiveMetaStoreBridge dgiBridge, HiveEventContext event) throws Exception {
    // Hacky: there is no easy way of getting the new table name directly
    assert event.getInputs() != null && event.getInputs().size() == 1;
    assert event.getOutputs() != null && event.getOutputs().size() > 0;

    // Update the entity if it does not exist
    ReadEntity oldEntity = event.getInputs().iterator().next();
    Table oldTable = oldEntity.getTable();

    for (WriteEntity writeEntity : event.getOutputs()) {
        if (writeEntity.getType() == Entity.Type.TABLE) {
            Table newTable = writeEntity.getTable();
            // Hive puts both the old and the new table name in the outputs, which is odd;
            // skip the old one with the check below
            if (!newTable.getDbName().equals(oldTable.getDbName())
                || !newTable.getTableName().equals(oldTable.getTableName())) {
                final String oldQualifiedName =
                    dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), oldTable);
                final String newQualifiedName =
                    dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), newTable);

                // Create/update the old table entity: create an entity with the old QF name and
                // old table name if it doesn't exist; if it exists, it will be updated.
                // We always use the new entity while creating the table, since some flags and
                // attributes of the table are not set in the input entity, and
                // Hive.getTable(oldTableName) also fails because the table no longer exists in Hive.
                final LinkedHashMap<Type, Referenceable> tables =
                    createOrUpdateEntities(dgiBridge, event, writeEntity, true);
                Referenceable tableEntity = tables.get(Type.TABLE);

                // Reset the regular column QF names to the old names and create partial-update
                // requests that replace the old column QF names with the new ones, so any
                // existing traits are retained
                replaceColumnQFName(
                    event,
                    (List<Referenceable>) tableEntity.get(HiveDataModelGenerator.COLUMNS),
                    oldQualifiedName,
                    newQualifiedName);

                // Do the same for the partition key columns
                replaceColumnQFName(
                    event,
                    (List<Referenceable>) tableEntity.get(HiveDataModelGenerator.PART_COLS),
                    oldQualifiedName,
                    newQualifiedName);

                // Reset the SD QF name to the old name and create a partial-update request to
                // replace it with the new one, retaining any existing traits
                replaceSDQFName(event, tableEntity, oldQualifiedName, newQualifiedName);

                // Reset the table QF name to the old name and create a partial-update request
                // to replace it with the new one
                replaceTableQFName(
                    event, oldTable, newTable, tableEntity, oldQualifiedName, newQualifiedName);
            }
        }
    }
}
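// NOTE: replaceTableQFName(...) is invoked above but not shown in this excerpt. A minimal
// sketch, by analogy with replaceSDQFName and replaceColumnQFName: reset the table entity
// to the old qualified name and emit a partial-update request keyed by it, so that any
// existing traits survive the rename. The real implementation may set further attributes.
private Referenceable replaceTableQFName(
        HiveEventContext event,
        Table oldTable,
        Table newTable,
        Referenceable tableEntity,
        String oldTableQFName,
        String newTableQFName) {
    tableEntity.set(AtlasClient.NAME, oldTable.getTableName().toLowerCase());
    tableEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, oldTableQFName);

    // Carry only the renamed attributes in the partial update
    Referenceable newEntity = new Referenceable(HiveDataTypes.HIVE_TABLE.getName());
    newEntity.set(AtlasClient.NAME, newTable.getTableName().toLowerCase());
    newEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newTableQFName);

    event.addMessage(
        new HookNotification.EntityPartialUpdateRequest(
            event.getUser(),
            HiveDataTypes.HIVE_TABLE.getName(),
            AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
            oldTableQFName,
            newEntity));
    return newEntity;
}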
private void deleteTable(
        HiveMetaStoreBridge dgiBridge, HiveEventContext event, WriteEntity output) {
    final String tblQualifiedName =
        HiveMetaStoreBridge.getTableQualifiedName(dgiBridge.getClusterName(), output.getTable());
    LOG.info("Deleting table {}", tblQualifiedName);
    event.addMessage(
        new HookNotification.EntityDeleteRequest(
            event.getUser(),
            HiveDataTypes.HIVE_TABLE.getName(),
            AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
            tblQualifiedName));
}
@Test
public void testCreateTable() throws Exception {
    String tableName = tableName();
    String dbName = createDatabase();
    String colName = "col" + random();
    runCommand("create table " + dbName + "." + tableName + "(" + colName + " int, name string)");
    assertTableIsRegistered(dbName, tableName);

    // There is only one instance of the column registered
    String colId = assertColumnIsRegistered(colName);
    Referenceable colEntity = dgiCLient.getEntity(colId);
    Assert.assertEquals(
        colEntity.get("qualifiedName"),
        String.format(
            "%s.%s.%s@%s",
            dbName.toLowerCase(),
            tableName.toLowerCase(),
            colName.toLowerCase(),
            CLUSTER_NAME));

    tableName = createTable();
    String tableId = assertTableIsRegistered(DEFAULT_DB, tableName);
    Referenceable tableRef = dgiCLient.getEntity(tableId);
    Assert.assertEquals(tableRef.get("tableType"), TableType.MANAGED_TABLE.name());
    Assert.assertEquals(tableRef.get(HiveDataModelGenerator.COMMENT), "table comment");
    String entityName = HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, tableName);
    Assert.assertEquals(tableRef.get(HiveDataModelGenerator.NAME), entityName);
    Assert.assertEquals(
        tableRef.get("name"), "default." + tableName.toLowerCase() + "@" + CLUSTER_NAME);

    final Referenceable sdRef = (Referenceable) tableRef.get("sd");
    Assert.assertEquals(sdRef.get(HiveDataModelGenerator.STORAGE_IS_STORED_AS_SUB_DIRS), false);

    // Creating a table whose database is not yet registered registers the database instance as well
    assertDatabaseIsRegistered(DEFAULT_DB);
}
private void handleExternalTables(
        final HiveMetaStoreBridge dgiBridge,
        final HiveEventContext event,
        final LinkedHashMap<Type, Referenceable> tables)
        throws HiveException, MalformedURLException {
    List<Referenceable> entities = new ArrayList<>();
    final Entity hiveEntity = getEntityByType(event.getOutputs(), Type.TABLE);
    Table hiveTable = hiveEntity.getTable();
    // Refresh to get the correct location
    hiveTable = dgiBridge.hiveClient.getTable(hiveTable.getDbName(), hiveTable.getTableName());

    // Check for null before dereferencing the refreshed table (the original computed the
    // location first, which would NPE if the lookup returned null)
    if (hiveTable != null && TableType.EXTERNAL_TABLE.equals(hiveTable.getTableType())) {
        final String location = lower(hiveTable.getDataLocation().toString());
        LOG.info("Registering external table process {}", event.getQueryStr());
        final ReadEntity dfsEntity = new ReadEntity();
        dfsEntity.setTyp(Type.DFS_DIR);
        dfsEntity.setName(location);

        SortedMap<Entity, Referenceable> inputs =
            new TreeMap<Entity, Referenceable>(entityComparator) {
                {
                    put(dfsEntity, dgiBridge.fillHDFSDataSet(location));
                }
            };
        SortedMap<Entity, Referenceable> outputs =
            new TreeMap<Entity, Referenceable>(entityComparator) {
                {
                    put(hiveEntity, tables.get(Type.TABLE));
                }
            };

        Referenceable processReferenceable =
            getProcessReferenceable(dgiBridge, event, inputs, outputs);
        String tableQualifiedName =
            dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), hiveTable);
        if (isCreateOp(event)) {
            processReferenceable.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, tableQualifiedName);
        }
        entities.addAll(tables.values());
        entities.add(processReferenceable);
        event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    }
}
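// NOTE: entityComparator is referenced above but not defined in this excerpt. A minimal
// sketch, assuming entities are ordered by name so the SortedMap iteration order (and
// hence the process qualified name) is deterministic; the real comparator may also break
// ties on other fields.
private static final Comparator<Entity> entityComparator =
    new Comparator<Entity>() {
        @Override
        public int compare(Entity o1, Entity o2) {
            String name1 = (o1.getName() == null) ? "" : o1.getName().toLowerCase();
            String name2 = (o2.getName() == null) ? "" : o2.getName().toLowerCase();
            return name1.compareTo(name2);
        }
    };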
private void deleteDatabase(HiveMetaStoreBridge dgiBridge, HiveEventContext event) {
    if (event.getOutputs().size() > 1) {
        LOG.info("Starting deletion of tables and databases with cascade {}", event.getQueryStr());
    }
    for (WriteEntity output : event.getOutputs()) {
        if (Type.TABLE.equals(output.getType())) {
            deleteTable(dgiBridge, event, output);
        } else if (Type.DATABASE.equals(output.getType())) {
            final String dbQualifiedName =
                HiveMetaStoreBridge.getDBQualifiedName(
                    dgiBridge.getClusterName(), output.getDatabase().getName());
            event.addMessage(
                new HookNotification.EntityDeleteRequest(
                    event.getUser(),
                    HiveDataTypes.HIVE_DB.getName(),
                    AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                    dbQualifiedName));
        }
    }
}
private void renameColumn(HiveMetaStoreBridge dgiBridge, HiveEventContext event) throws Exception {
    assert event.getInputs() != null && event.getInputs().size() == 1;
    assert event.getOutputs() != null && event.getOutputs().size() > 0;

    Table oldTable = event.getInputs().iterator().next().getTable();
    List<FieldSchema> oldColList = oldTable.getAllCols();
    Table outputTbl = event.getOutputs().iterator().next().getTable();
    // Refresh the output table to pick up the renamed column
    outputTbl = dgiBridge.hiveClient.getTable(outputTbl.getDbName(), outputTbl.getTableName());
    List<FieldSchema> newColList = outputTbl.getAllCols();
    assert oldColList.size() == newColList.size();

    Pair<String, String> changedColNamePair = findChangedColNames(oldColList, newColList);
    String oldColName = changedColNamePair.getLeft();
    String newColName = changedColNamePair.getRight();

    for (WriteEntity writeEntity : event.getOutputs()) {
        if (writeEntity.getType() == Type.TABLE) {
            Table newTable = writeEntity.getTable();
            createOrUpdateEntities(dgiBridge, event, writeEntity, true, oldTable);
            final String newQualifiedTableName =
                dgiBridge.getTableQualifiedName(dgiBridge.getClusterName(), newTable);
            String oldColumnQFName =
                HiveMetaStoreBridge.getColumnQualifiedName(newQualifiedTableName, oldColName);
            String newColumnQFName =
                HiveMetaStoreBridge.getColumnQualifiedName(newQualifiedTableName, newColName);
            Referenceable newColEntity = new Referenceable(HiveDataTypes.HIVE_COLUMN.getName());
            newColEntity.set(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME, newColumnQFName);

            // Partial update keyed by the old column QF name, so existing traits are retained
            event.addMessage(
                new HookNotification.EntityPartialUpdateRequest(
                    event.getUser(),
                    HiveDataTypes.HIVE_COLUMN.getName(),
                    AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
                    oldColumnQFName,
                    newColEntity));
        }
    }
    handleEventOutputs(dgiBridge, event, Type.TABLE);
}
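// NOTE: findChangedColNames(...) is used by renameColumn but not shown in this excerpt.
// A minimal sketch, assuming the two equally sized lists differ in exactly one column
// name: diff the name sets and return the (oldName, newName) pair. The real
// implementation may instead match columns positionally.
private Pair<String, String> findChangedColNames(
        List<FieldSchema> oldColList, List<FieldSchema> newColList) {
    Set<String> oldNames = new HashSet<>();
    for (FieldSchema col : oldColList) {
        oldNames.add(col.getName());
    }
    Set<String> newNames = new HashSet<>();
    for (FieldSchema col : newColList) {
        newNames.add(col.getName());
    }
    String oldName = null;
    String newName = null;
    for (String name : oldNames) {
        if (!newNames.contains(name)) {
            oldName = name; // present before the rename, absent after
        }
    }
    for (String name : newNames) {
        if (!oldNames.contains(name)) {
            newName = name; // present after the rename, absent before
        }
    }
    return Pair.of(oldName, newName);
}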
@Test
public void testLineage() throws Exception {
    String table1 = createTable(false);
    String db2 = createDatabase();
    String table2 = tableName();
    String query = String.format("create table %s.%s as select * from %s", db2, table2, table1);
    runCommand(query);

    String table1Id = assertTableIsRegistered(DEFAULT_DB, table1);
    String table2Id = assertTableIsRegistered(db2, table2);

    String datasetName = HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, db2, table2);
    JSONObject response = dgiCLient.getInputGraph(datasetName);
    JSONObject vertices = response.getJSONObject("values").getJSONObject("vertices");
    Assert.assertTrue(vertices.has(table1Id));
    Assert.assertTrue(vertices.has(table2Id));

    datasetName = HiveMetaStoreBridge.getTableQualifiedName(CLUSTER_NAME, DEFAULT_DB, table1);
    response = dgiCLient.getOutputGraph(datasetName);
    vertices = response.getJSONObject("values").getJSONObject("vertices");
    Assert.assertTrue(vertices.has(table1Id));
    Assert.assertTrue(vertices.has(table2Id));
}
private Referenceable getProcessReferenceable(
        HiveMetaStoreBridge dgiBridge,
        HiveEventContext hiveEvent,
        SortedMap<Entity, Referenceable> source,
        SortedMap<Entity, Referenceable> target) {
    Referenceable processReferenceable = new Referenceable(HiveDataTypes.HIVE_PROCESS.getName());
    String queryStr = lower(hiveEvent.getQueryStr());
    processReferenceable.set(
        AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME,
        getProcessQualifiedName(hiveEvent.getOperation(), source, target));
    LOG.debug("Registering query: {}", queryStr);

    // The serialization code expects lists; skip empty ones
    // (the lists are never null here, so only the emptiness check is needed)
    List<Referenceable> sourceList = new ArrayList<>(source.values());
    List<Referenceable> targetList = new ArrayList<>(target.values());
    if (!sourceList.isEmpty()) {
        processReferenceable.set("inputs", sourceList);
    }
    if (!targetList.isEmpty()) {
        processReferenceable.set("outputs", targetList);
    }
    processReferenceable.set(AtlasClient.NAME, queryStr);
    processReferenceable.set("operationType", hiveEvent.getOperation().getOperationName());
    processReferenceable.set("startTime", new Date(hiveEvent.getQueryStartTime()));
    processReferenceable.set("userName", hiveEvent.getUser());
    processReferenceable.set("queryText", queryStr);
    processReferenceable.set("queryId", hiveEvent.getQueryId());
    processReferenceable.set("queryPlan", hiveEvent.getJsonPlan());
    processReferenceable.set(AtlasConstants.CLUSTER_NAME_ATTRIBUTE, dgiBridge.getClusterName());

    List<String> recentQueries = new ArrayList<>(1);
    recentQueries.add(queryStr);
    processReferenceable.set("recentQueries", recentQueries);

    processReferenceable.set("endTime", new Date(System.currentTimeMillis()));
    // TODO: set queryGraph
    return processReferenceable;
}
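// NOTE: getProcessQualifiedName(...) is referenced above but not shown in this excerpt.
// A minimal sketch, assuming getOperation() returns Hive's HiveOperation and that the
// process qualified name is the operation name followed by the qualified names of all
// inputs and outputs (the maps are already sorted, so the result is deterministic); the
// real naming scheme may normalize or combine these parts differently. The
// appendQualifiedNames helper below is likewise hypothetical.
private String getProcessQualifiedName(
        HiveOperation op,
        SortedMap<Entity, Referenceable> inputs,
        SortedMap<Entity, Referenceable> outputs) {
    StringBuilder buffer = new StringBuilder(op.getOperationName());
    appendQualifiedNames(buffer, inputs);
    buffer.append("->");
    appendQualifiedNames(buffer, outputs);
    return buffer.toString();
}

private void appendQualifiedNames(StringBuilder buffer, SortedMap<Entity, Referenceable> refs) {
    for (Referenceable ref : refs.values()) {
        buffer.append(":").append(ref.get(AtlasClient.REFERENCEABLE_ATTRIBUTE_NAME));
    }
}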
private LinkedHashMap<Type, Referenceable> createOrUpdateEntities(
        HiveMetaStoreBridge dgiBridge,
        HiveEventContext event,
        Entity entity,
        boolean skipTempTables,
        Table existTable)
        throws Exception {
    Database db = null;
    Table table = null;
    Partition partition = null;
    LinkedHashMap<Type, Referenceable> result = new LinkedHashMap<>();
    List<Referenceable> entities = new ArrayList<>();

    switch (entity.getType()) {
        case DATABASE:
            db = entity.getDatabase();
            break;
        case TABLE:
            table = entity.getTable();
            db = dgiBridge.hiveClient.getDatabase(table.getDbName());
            break;
        case PARTITION:
            partition = entity.getPartition();
            table = partition.getTable();
            db = dgiBridge.hiveClient.getDatabase(table.getDbName());
            break;
    }

    db = dgiBridge.hiveClient.getDatabase(db.getName());
    Referenceable dbEntity = dgiBridge.createDBInstance(db);
    entities.add(dbEntity);
    result.put(Type.DATABASE, dbEntity);

    Referenceable tableEntity = null;
    if (table != null) {
        if (existTable != null) {
            table = existTable;
        } else {
            table = dgiBridge.hiveClient.getTable(table.getDbName(), table.getTableName());
        }
        // Even when the temp-table skip flag is on, an external table is still registered,
        // since its HDFS path is needed for temp table lineage
        if (skipTempTables
            && table.isTemporary()
            && !TableType.EXTERNAL_TABLE.equals(table.getTableType())) {
            LOG.debug(
                "Skipping registration of temporary table {} since it is not an external table (type {})",
                table.getTableName(),
                table.getTableType().name());
        } else {
            tableEntity = dgiBridge.createTableInstance(dbEntity, table);
            entities.add(tableEntity);
            result.put(Type.TABLE, tableEntity);
        }
    }

    event.addMessage(new HookNotification.EntityUpdateRequest(event.getUser(), entities));
    return result;
}
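// The call sites above (processHiveEntity, renameTable) use a four-argument variant of
// createOrUpdateEntities. A minimal sketch of that overload, assuming it simply delegates
// with no pre-fetched table:
private LinkedHashMap<Type, Referenceable> createOrUpdateEntities(
        HiveMetaStoreBridge dgiBridge, HiveEventContext event, Entity entity, boolean skipTempTables)
        throws Exception {
    return createOrUpdateEntities(dgiBridge, event, entity, skipTempTables, null);
}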