/**
 * Converts runtime edge data to persistent edge data (includes source/target vertex data) and
 * writes it to HBase.
 *
 * @param epgmDatabase EPGM database instance
 * @param edgeDataHandler edge data handler
 * @param persistentEdgeDataFactory persistent edge data factory
 * @param edgeDataTableName HBase edge data table name
 * @param <PED> persistent edge data type
 * @throws IOException if the Hadoop job for the HBase output format cannot be created
 */
public <PED extends PersistentEdgeData<VD>> void writeEdgeData(
    final EPGMDatabase<VD, ED, GD> epgmDatabase,
    final EdgeDataHandler<ED, VD> edgeDataHandler,
    final PersistentEdgeDataFactory<ED, VD, PED> persistentEdgeDataFactory,
    final String edgeDataTableName) throws IOException {
  Graph<Long, VD, ED> graph = epgmDatabase.getDatabaseGraph().getGellyGraph();

  DataSet<PersistentEdgeData<VD>> persistentEdgeDataSet = graph
      .getVertices()
      // join vertex with edges on edge source vertex id
      .join(graph.getEdges())
      .where(0)
      .equalTo(1)
      // join result with vertices on edge target vertex id
      .join(graph.getVertices())
      .where("1.1")
      .equalTo(0)
      // ((source-vertex-data, edge-data), target-vertex-data)
      .with(new PersistentEdgeDataJoinFunction<>(persistentEdgeDataFactory));

  // write (persistent-edge-data) to HBase table
  Job job = Job.getInstance();
  job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, edgeDataTableName);

  persistentEdgeDataSet
      .map(new HBaseWriter.EdgeDataToHBaseMapper<>(edgeDataHandler))
      .output(new HadoopOutputFormat<>(new TableOutputFormat<LongWritable>(), job));
}
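// A minimal sketch of the same two-step join pattern on plain Gelly types instead of Gradoop's
// EPGM classes. The enrich() helper, the String vertex values and the Double edge values are
// assumptions for illustration only; just the DataSet/Gelly calls mirror writeEdgeData() above.
// Returns (source-vertex-value, target-vertex-value, edge-value) triples.
public static DataSet<Tuple3<String, String, Double>> enrich(Graph<Long, String, Double> graph) {
  return graph
      .getEdges()
      // join edges with vertices on the edge source id -> (edge-data, source-vertex-data)
      .join(graph.getVertices())
      .where(0)   // edge source vertex id
      .equalTo(0) // vertex id
      // join the pair with vertices on the edge target id
      // -> ((edge-data, source-vertex-data), target-vertex-data)
      .join(graph.getVertices())
      .where("f0.f1") // nested field: the edge's target vertex id
      .equalTo(0)     // vertex id
      .with(
          new JoinFunction<Tuple2<Edge<Long, Double>, Vertex<Long, String>>,
              Vertex<Long, String>, Tuple3<String, String, Double>>() {
            @Override
            public Tuple3<String, String, Double> join(
                Tuple2<Edge<Long, Double>, Vertex<Long, String>> edgeWithSource,
                Vertex<Long, String> target) {
              return new Tuple3<>(
                  edgeWithSource.f1.getValue(), target.getValue(), edgeWithSource.f0.getValue());
            }
          });
}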
@SuppressWarnings("serial") public static void main(String[] args) throws Exception { if (!parseParameters(args)) { return; } ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); DataSet<Edge<Long, NullValue>> edges = getEdgesDataSet(env); Graph<Long, Long, NullValue> graph = Graph.fromDataSet( edges, new MapFunction<Long, Long>() { @Override public Long map(Long value) throws Exception { return value; } }, env); DataSet<Vertex<Long, Long>> verticesWithMinIds = graph.run(new GSAConnectedComponents<Long, Long, NullValue>(maxIterations)); // emit result if (fileOutput) { verticesWithMinIds.writeAsCsv(outputPath, "\n", ","); // since file sinks are lazy, we trigger the execution explicitly env.execute("Connected Components Example"); } else { verticesWithMinIds.print(); } }
/**
 * Writes a graph to the file system as two CSV files (edges, vertices).
 *
 * @param g The graph.
 * @param filePathEdges File path for edges.
 * @param filePathVertices File path for vertices.
 */
@SuppressWarnings("rawtypes")
public static void printGraph(Graph g, String filePathEdges, String filePathVertices) {
  try {
    g.getEdgesAsTuple3().writeAsCsv(filePathEdges, "\n", "\t", WriteMode.OVERWRITE);
    g.getVerticesAsTuple2().writeAsCsv(filePathVertices, "\n", "\t", WriteMode.OVERWRITE);
  } catch (Exception e) {
    e.printStackTrace();
  }
}
/**
 * Collects the edges and vertices of a graph <code>g</code> and returns them as a printable
 * string.
 *
 * @param g A graph.
 * @return A string listing the edges and vertices of <code>g</code>.
 */
@SuppressWarnings("rawtypes")
public static String showEdgesVertices(Graph g) {
  String str = "";
  try {
    str = "Edges:\n";
    for (Object edge : g.getEdges().collect()) {
      str += " " + edge + "\n";
    }
    str += "\nVertices:\n";
    for (Object vertex : g.getVertices().collect()) {
      str += " " + vertex + "\n";
    }
  } catch (Exception e1) {
    e1.printStackTrace();
  }
  return str;
}
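// Hypothetical usage of the two helpers above on a small hard-coded graph; the file paths and
// the Long/NullValue value types are placeholders. showEdgesVertices() triggers execution via
// collect(), while printGraph() only registers lazy CSV sinks, so env.execute() is still needed.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

Graph<Long, Long, NullValue> g = Graph.fromCollection(
    Arrays.asList(new Vertex<>(1L, 1L), new Vertex<>(2L, 2L), new Vertex<>(3L, 3L)),
    Arrays.asList(
        new Edge<>(1L, 2L, NullValue.getInstance()),
        new Edge<>(2L, 3L, NullValue.getInstance())),
    env);

// collect() inside showEdgesVertices() runs the program and returns the listing
System.out.println(showEdgesVertices(g));

// writeAsCsv() inside printGraph() is lazy, so trigger the file sinks explicitly
printGraph(g, "/tmp/edges.csv", "/tmp/vertices.csv");
env.execute("Write graph to CSV");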
@Override
public DataSet<Edge<K, Tuple2<EV, LongValue>>> run(Graph<K, VV, EV> input) throws Exception {
  // t, d(t)
  DataSet<Vertex<K, LongValue>> vertexDegrees = input.run(
      new VertexDegree<K, VV, EV>()
          .setReduceOnTargetId(!reduceOnSourceId)
          .setParallelism(parallelism));

  // s, t, d(t)
  return input
      .getEdges()
      .join(vertexDegrees, JoinHint.REPARTITION_HASH_SECOND)
      .where(1)
      .equalTo(0)
      .with(new JoinEdgeWithVertexDegree<K, EV>())
      .setParallelism(parallelism)
      .name("Edge target degree");
}
@Test
public void testWithSimpleGraph() throws Exception {
  Graph<IntValue, NullValue, NullValue> graph =
      undirectedSimpleGraph.run(new MaximumDegree<IntValue, NullValue, NullValue>(3));

  String expectedVerticesResult =
      "(0,(null))\n" +
      "(1,(null))\n" +
      "(2,(null))\n" +
      "(4,(null))\n" +
      "(5,(null))";

  TestBaseUtils.compareResultAsText(graph.getVertices().collect(), expectedVerticesResult);

  String expectedEdgesResult =
      "(0,1,(null))\n" +
      "(0,2,(null))\n" +
      "(1,0,(null))\n" +
      "(1,2,(null))\n" +
      "(2,0,(null))\n" +
      "(2,1,(null))";

  TestBaseUtils.compareResultAsText(graph.getEdges().collect(), expectedEdgesResult);
}
@Override
public VertexMetrics<K, VV, EV> run(Graph<K, VV, EV> input) throws Exception {
  super.run(input);

  DataSet<Vertex<K, Degrees>> vertexDegree = input.run(
      new VertexDegrees<K, VV, EV>()
          .setIncludeZeroDegreeVertices(includeZeroDegreeVertices)
          .setParallelism(parallelism));

  vertexDegree.output(new VertexMetricsHelper<K>(id)).name("Vertex metrics");

  return this;
}
/** {@inheritDoc} */
@Override
protected LogicalGraph<VD, ED, GD> executeInternal(
    LogicalGraph<VD, ED, GD> firstGraph, LogicalGraph<VD, ED, GD> secondGraph) {
  final Long newGraphID = FlinkConstants.EXCLUDE_GRAPH_ID;

  Graph<Long, VD, ED> graph1 = firstGraph.getGellyGraph();
  Graph<Long, VD, ED> graph2 = secondGraph.getGellyGraph();

  // union vertex sets, group by vertex id, filter vertices where the group
  // contains exactly one vertex which belongs to the graph the operator is
  // called on
  DataSet<Vertex<Long, VD>> newVertexSet = graph1
      .getVertices()
      .union(graph2.getVertices())
      .groupBy(new KeySelectors.VertexKeySelector<VD>())
      .reduceGroup(new VertexGroupReducer<VD>(1L, firstGraph.getId(), secondGraph.getId()))
      .map(new VertexToGraphUpdater<VD>(newGraphID));

  JoinFunction<Edge<Long, ED>, Vertex<Long, VD>, Edge<Long, ED>> joinFunc =
      new JoinFunction<Edge<Long, ED>, Vertex<Long, VD>, Edge<Long, ED>>() {
        @Override
        public Edge<Long, ED> join(Edge<Long, ED> leftTuple, Vertex<Long, VD> rightTuple)
            throws Exception {
          return leftTuple;
        }
      };

  // In exclude(), we are only interested in edges that connect vertices
  // that are in the exclusion of the vertex sets. Thus, we join the edges
  // from the left graph with the new vertex set using source and target ids.
  DataSet<Edge<Long, ED>> newEdgeSet = graph1
      .getEdges()
      .join(newVertexSet)
      .where(new KeySelectors.EdgeSourceVertexKeySelector<ED>())
      .equalTo(new KeySelectors.VertexKeySelector<VD>())
      .with(joinFunc)
      .join(newVertexSet)
      .where(new KeySelectors.EdgeTargetVertexKeySelector<ED>())
      .equalTo(new KeySelectors.VertexKeySelector<VD>())
      .with(joinFunc)
      .map(new EdgeToGraphUpdater<ED>(newGraphID));

  return LogicalGraph.fromGraph(
      Graph.fromDataSet(newVertexSet, newEdgeSet, graph1.getContext()),
      firstGraph.getGraphDataFactory().createGraphData(newGraphID),
      firstGraph.getVertexDataFactory(),
      firstGraph.getEdgeDataFactory(),
      firstGraph.getGraphDataFactory());
}
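// A stripped-down sketch of the vertex-set construction above on plain ids (Gradoop's
// VertexGroupReducer and graph-membership bookkeeping are replaced by a hypothetical origin
// flag): keep exactly those ids that occur once across the union and come from the first graph.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// (vertex id, origin) pairs: origin 1 = first graph, origin 2 = second graph
DataSet<Tuple2<Long, Integer>> firstIds = env.fromElements(
    new Tuple2<>(1L, 1), new Tuple2<>(2L, 1), new Tuple2<>(3L, 1));
DataSet<Tuple2<Long, Integer>> secondIds = env.fromElements(
    new Tuple2<>(2L, 2), new Tuple2<>(3L, 2), new Tuple2<>(4L, 2));

// union both sets, group by id, keep groups that consist of a single record from the first
// input; for the data above only vertex id 1 survives
DataSet<Long> excludedIds = firstIds
    .union(secondIds)
    .groupBy(0)
    .reduceGroup(new GroupReduceFunction<Tuple2<Long, Integer>, Long>() {
      @Override
      public void reduce(Iterable<Tuple2<Long, Integer>> group, Collector<Long> out) {
        Tuple2<Long, Integer> only = null;
        int size = 0;
        for (Tuple2<Long, Integer> record : group) {
          only = record;
          size++;
        }
        if (size == 1 && only.f1 == 1) {
          out.collect(only.f0);
        }
      }
    });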
@Test
public void testTranslation() {
  try {
    final String ITERATION_NAME = "Test Name";
    final String AGGREGATOR_NAME = "AggregatorName";
    final String BC_SET_GATHER_NAME = "gather messages";
    final String BC_SET_SUM_NAME = "sum updates";
    final String BC_SET_APPLY_NAME = "apply updates";
    final int NUM_ITERATIONS = 13;
    final int ITERATION_parallelism = 77;

    ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    DataSet<Long> bcGather = env.fromElements(1L);
    DataSet<Long> bcSum = env.fromElements(1L);
    DataSet<Long> bcApply = env.fromElements(1L);

    DataSet<Vertex<Long, Long>> result;

    // ------------ construct the test program ------------------
    {
      DataSet<Edge<Long, NullValue>> edges =
          env.fromElements(new Tuple3<Long, Long, NullValue>(1L, 2L, NullValue.getInstance()))
              .map(new Tuple3ToEdgeMap<Long, NullValue>());

      Graph<Long, Long, NullValue> graph = Graph.fromDataSet(edges, new InitVertices(), env);

      GSAConfiguration parameters = new GSAConfiguration();

      parameters.registerAggregator(AGGREGATOR_NAME, new LongSumAggregator());
      parameters.setName(ITERATION_NAME);
      parameters.setParallelism(ITERATION_parallelism);
      parameters.addBroadcastSetForGatherFunction(BC_SET_GATHER_NAME, bcGather);
      parameters.addBroadcastSetForSumFunction(BC_SET_SUM_NAME, bcSum);
      parameters.addBroadcastSetForApplyFunction(BC_SET_APPLY_NAME, bcApply);

      result = graph
          .runGatherSumApplyIteration(
              new GatherNeighborIds(),
              new SelectMinId(),
              new UpdateComponentId(),
              NUM_ITERATIONS,
              parameters)
          .getVertices();

      result.output(new DiscardingOutputFormat<Vertex<Long, Long>>());
    }

    // ------------- validate the java program ----------------

    assertTrue(result instanceof DeltaIterationResultSet);

    DeltaIterationResultSet<?, ?> resultSet = (DeltaIterationResultSet<?, ?>) result;
    DeltaIteration<?, ?> iteration = (DeltaIteration<?, ?>) resultSet.getIterationHead();

    // check the basic iteration properties
    assertEquals(NUM_ITERATIONS, resultSet.getMaxIterations());
    assertArrayEquals(new int[] {0}, resultSet.getKeyPositions());
    assertEquals(ITERATION_parallelism, iteration.getParallelism());
    assertEquals(ITERATION_NAME, iteration.getName());
    assertEquals(
        AGGREGATOR_NAME,
        iteration.getAggregators().getAllRegisteredAggregators().iterator().next().getName());

    // validate that the semantic properties are set as they should
    TwoInputUdfOperator<?, ?, ?, ?> solutionSetJoin =
        (TwoInputUdfOperator<?, ?, ?, ?>) resultSet.getNextWorkset();
    assertTrue(
        solutionSetJoin.getSemanticProperties().getForwardingTargetFields(0, 0).contains(0));
    assertTrue(
        solutionSetJoin.getSemanticProperties().getForwardingTargetFields(1, 0).contains(0));

    SingleInputUdfOperator<?, ?, ?> sumReduce =
        (SingleInputUdfOperator<?, ?, ?>) solutionSetJoin.getInput1();
    SingleInputUdfOperator<?, ?, ?> gatherMap =
        (SingleInputUdfOperator<?, ?, ?>) sumReduce.getInput();

    // validate that the broadcast sets are forwarded
    assertEquals(bcGather, gatherMap.getBroadcastSets().get(BC_SET_GATHER_NAME));
    assertEquals(bcSum, sumReduce.getBroadcastSets().get(BC_SET_SUM_NAME));
    assertEquals(bcApply, solutionSetJoin.getBroadcastSets().get(BC_SET_APPLY_NAME));
  } catch (Exception e) {
    System.err.println(e.getMessage());
    e.printStackTrace();
    fail(e.getMessage());
  }
}
/**
 * Converts runtime vertex data to persistent vertex data (includes incoming and outgoing edge
 * data) and writes it to HBase.
 *
 * @param epgmDatabase EPGM database instance
 * @param vertexDataHandler vertex data handler
 * @param persistentVertexDataFactory persistent vertex data factory
 * @param vertexDataTableName HBase vertex data table name
 * @param <PVD> persistent vertex data type
 * @throws Exception if the conversion or the HBase output job setup fails
 */
public <PVD extends PersistentVertexData<ED>> void writeVertexData(
    final EPGMDatabase<VD, ED, GD> epgmDatabase,
    final VertexDataHandler<VD, ED> vertexDataHandler,
    final PersistentVertexDataFactory<VD, ED, PVD> persistentVertexDataFactory,
    final String vertexDataTableName) throws Exception {
  final Graph<Long, VD, ED> graph = epgmDatabase.getDatabaseGraph().getGellyGraph();

  // group edges by source vertex id (vertex-id, [out-edge-data])
  GroupReduceOperator<Edge<Long, ED>, Tuple2<Long, Set<ED>>> vertexToOutgoingEdges = graph
      .getEdges()
      .groupBy(0) // group by source vertex id
      .reduceGroup(
          new GroupReduceFunction<Edge<Long, ED>, Tuple2<Long, Set<ED>>>() {
            @Override
            public void reduce(
                Iterable<Edge<Long, ED>> edgeIterable,
                Collector<Tuple2<Long, Set<ED>>> collector) throws Exception {
              Set<ED> outgoingEdgeData = Sets.newHashSet();
              Long vertexId = null;
              boolean initialized = false;
              for (Edge<Long, ED> edgeData : edgeIterable) {
                if (!initialized) {
                  vertexId = edgeData.getSource();
                  initialized = true;
                }
                outgoingEdgeData.add(edgeData.getValue());
              }
              collector.collect(new Tuple2<>(vertexId, outgoingEdgeData));
            }
          });

  // group edges by target vertex id (vertex-id, [in-edge-data])
  GroupReduceOperator<Edge<Long, ED>, Tuple2<Long, Set<ED>>> vertexToIncomingEdges = graph
      .getEdges()
      .groupBy(1) // group by target vertex id
      .reduceGroup(
          new GroupReduceFunction<Edge<Long, ED>, Tuple2<Long, Set<ED>>>() {
            @Override
            public void reduce(
                Iterable<Edge<Long, ED>> edgeIterable,
                Collector<Tuple2<Long, Set<ED>>> collector) throws Exception {
              Set<ED> incomingEdgeData = Sets.newHashSet();
              Long vertexId = null;
              boolean initialized = false;
              for (Edge<Long, ED> edgeData : edgeIterable) {
                if (!initialized) {
                  vertexId = edgeData.getTarget();
                  initialized = true;
                }
                incomingEdgeData.add(edgeData.getValue());
              }
              collector.collect(new Tuple2<>(vertexId, incomingEdgeData));
            }
          });

  // co-group (vertex-data) with (vertex-id, [out-edge-data]) to simulate left
  // outer join
  DataSet<Tuple2<Vertex<Long, VD>, Set<ED>>> vertexDataWithOutgoingEdges = graph
      .getVertices()
      .coGroup(vertexToOutgoingEdges)
      .where(0)
      .equalTo(0)
      .with(
          new CoGroupFunction<
              Vertex<Long, VD>, Tuple2<Long, Set<ED>>, Tuple2<Vertex<Long, VD>, Set<ED>>>() {
            @Override
            public void coGroup(
                Iterable<Vertex<Long, VD>> vertexIterable,
                Iterable<Tuple2<Long, Set<ED>>> outEdgesIterable,
                Collector<Tuple2<Vertex<Long, VD>, Set<ED>>> collector) throws Exception {
              Vertex<Long, VD> vertex = null;
              Set<ED> outgoingEdgeData = null;
              // read vertex data from left group
              for (Vertex<Long, VD> v : vertexIterable) {
                vertex = v;
              }
              // read outgoing edge data from right group (may be empty)
              for (Tuple2<Long, Set<ED>> oEdges : outEdgesIterable) {
                outgoingEdgeData = oEdges.f1;
              }
              collector.collect(new Tuple2<>(vertex, outgoingEdgeData));
            }
          });

  // co-group (vertex-data, (vertex-id, [out-edge-data])) with (vertex-id,
  // [in-edge-data]) to simulate left outer join
  DataSet<PersistentVertexData<ED>> persistentVertexDataSet = vertexDataWithOutgoingEdges
      .coGroup(vertexToIncomingEdges)
      .where("0.0")
      .equalTo(0)
      .with(new PersistentVertexDataCoGroupFunction<>(persistentVertexDataFactory));

  // write (persistent-vertex-data) to HBase table
  Job job = Job.getInstance();
  job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, vertexDataTableName);

  persistentVertexDataSet
      .map(new HBaseWriter.VertexDataToHBaseMapper<>(vertexDataHandler))
      .output(new HadoopOutputFormat<>(new TableOutputFormat<LongWritable>(), job));
}
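// The coGroup calls above emulate a left outer join: every vertex is emitted even when the
// group of outgoing (or incoming) edges is empty. A hypothetical minimal version of that
// pattern on plain tuples (newer Flink versions also offer DataSet.leftOuterJoin() directly):
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

// left: (id, name) records that must all survive; right: (id, value) records that may be absent
DataSet<Tuple2<Long, String>> left = env.fromElements(
    new Tuple2<>(1L, "a"), new Tuple2<>(2L, "b"));
DataSet<Tuple2<Long, Integer>> right = env.fromElements(new Tuple2<>(1L, 42));

// emits (1,a,42) and (2,b,-1); -1 is a sentinel for "no matching right record"
DataSet<Tuple3<Long, String, Integer>> leftOuter = left
    .coGroup(right)
    .where(0)
    .equalTo(0)
    .with(new CoGroupFunction<Tuple2<Long, String>, Tuple2<Long, Integer>,
        Tuple3<Long, String, Integer>>() {
      @Override
      public void coGroup(
          Iterable<Tuple2<Long, String>> leftGroup,
          Iterable<Tuple2<Long, Integer>> rightGroup,
          Collector<Tuple3<Long, String, Integer>> out) {
        int value = -1;
        // the right group may be empty; keep the sentinel in that case
        for (Tuple2<Long, Integer> r : rightGroup) {
          value = r.f1;
        }
        // emit every left record, enriched with the (possibly missing) right value
        for (Tuple2<Long, String> l : leftGroup) {
          out.collect(new Tuple3<>(l.f0, l.f1, value));
        }
      }
    });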
/**
 * Converts runtime graph data to persistent graph data (including vertex and edge identifiers)
 * and writes it to HBase.
 *
 * @param epgmDatabase EPGM database instance
 * @param graphDataHandler graph data handler
 * @param persistentGraphDataFactory persistent graph data factory
 * @param graphDataTableName HBase graph data table name
 * @param <PGD> persistent graph data type
 * @throws IOException if the Hadoop job for the HBase output format cannot be created
 */
public <PGD extends PersistentGraphData> void writeGraphData(
    final EPGMDatabase<VD, ED, GD> epgmDatabase,
    final GraphDataHandler<GD> graphDataHandler,
    final PersistentGraphDataFactory<GD, PGD> persistentGraphDataFactory,
    final String graphDataTableName) throws IOException {
  final Graph<Long, VD, ED> graph = epgmDatabase.getDatabaseGraph().getGellyGraph();

  // build (graph-id, vertex-id) tuples from vertices
  FlatMapOperator<Vertex<Long, VD>, Tuple2<Long, Long>> graphIdToVertexId = graph
      .getVertices()
      .flatMap(
          new FlatMapFunction<Vertex<Long, VD>, Tuple2<Long, Long>>() {
            @Override
            public void flatMap(
                Vertex<Long, VD> vertex, Collector<Tuple2<Long, Long>> collector)
                throws Exception {
              if (vertex.getValue().getGraphCount() > 0) {
                for (Long graphID : vertex.getValue().getGraphs()) {
                  collector.collect(new Tuple2<>(graphID, vertex.f0));
                }
              }
            }
          });

  // build (graph-id, edge-id) tuples from edges
  FlatMapOperator<Edge<Long, ED>, Tuple2<Long, Long>> graphIdToEdgeId = graph
      .getEdges()
      .flatMap(
          new FlatMapFunction<Edge<Long, ED>, Tuple2<Long, Long>>() {
            @Override
            public void flatMap(Edge<Long, ED> edge, Collector<Tuple2<Long, Long>> collector)
                throws Exception {
              if (edge.getValue().getGraphCount() > 0) {
                for (Long graphId : edge.getValue().getGraphs()) {
                  collector.collect(new Tuple2<>(graphId, edge.getValue().getId()));
                }
              }
            }
          });

  // co-group (graph-id, vertex-id) and (graph-id, edge-id) tuples to
  // (graph-id, {vertex-id}, {edge-id}) triples
  CoGroupOperator<Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple3<Long, Set<Long>, Set<Long>>>
      graphToVertexIdsAndEdgeIds = graphIdToVertexId
          .coGroup(graphIdToEdgeId)
          .where(0)
          .equalTo(0)
          .with(
              new CoGroupFunction<
                  Tuple2<Long, Long>, Tuple2<Long, Long>, Tuple3<Long, Set<Long>, Set<Long>>>() {
                @Override
                public void coGroup(
                    Iterable<Tuple2<Long, Long>> graphToVertexIds,
                    Iterable<Tuple2<Long, Long>> graphToEdgeIds,
                    Collector<Tuple3<Long, Set<Long>, Set<Long>>> collector) throws Exception {
                  Set<Long> vertexIds = Sets.newHashSet();
                  Set<Long> edgeIds = Sets.newHashSet();
                  Long graphId = null;
                  boolean initialized = false;
                  for (Tuple2<Long, Long> graphToVertexTuple : graphToVertexIds) {
                    if (!initialized) {
                      graphId = graphToVertexTuple.f0;
                      initialized = true;
                    }
                    vertexIds.add(graphToVertexTuple.f1);
                  }
                  for (Tuple2<Long, Long> graphToEdgeTuple : graphToEdgeIds) {
                    edgeIds.add(graphToEdgeTuple.f1);
                  }
                  collector.collect(new Tuple3<>(graphId, vertexIds, edgeIds));
                }
              });

  // join (graph-id, {vertex-id}, {edge-id}) triples with
  // (graph-id, graph-data) and build (persistent-graph-data)
  JoinOperator.EquiJoin<
      Tuple3<Long, Set<Long>, Set<Long>>, Subgraph<Long, GD>, PersistentGraphData>
      persistentGraphDataSet = graphToVertexIdsAndEdgeIds
          .join(epgmDatabase.getCollection().getSubgraphs())
          .where(0)
          .equalTo(0)
          .with(new PersistentGraphDataJoinFunction<>(persistentGraphDataFactory));

  // write (persistent-graph-data) to HBase table
  Job job = Job.getInstance();
  job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, graphDataTableName);

  persistentGraphDataSet
      .map(new HBaseWriter.GraphDataToHBaseMapper<>(graphDataHandler))
      .output(new HadoopOutputFormat<>(new TableOutputFormat<LongWritable>(), job));
}
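// A hypothetical, stripped-down sketch of the HBase sink wiring shared by the three write
// methods above: records are mapped to (key, Mutation) pairs and handed to Hadoop's
// TableOutputFormat via Flink's HadoopOutputFormat. Table name, row key, column family and
// qualifier are made-up placeholders; Put.add(...) is the pre-1.0 HBase client signature
// (addColumn(...) in newer versions).
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
DataSet<Long> ids = env.fromElements(1L, 2L, 3L);

Job job = Job.getInstance();
job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, "example_table");

DataSet<Tuple2<LongWritable, Mutation>> mutations = ids
    .map(new MapFunction<Long, Tuple2<LongWritable, Mutation>>() {
      @Override
      public Tuple2<LongWritable, Mutation> map(Long id) throws Exception {
        // one Put per record: row key and a single placeholder column
        Put put = new Put(Bytes.toBytes(id));
        put.add(Bytes.toBytes("meta"), Bytes.toBytes("id"), Bytes.toBytes(id));
        return new Tuple2<LongWritable, Mutation>(new LongWritable(id), put);
      }
    });

mutations.output(new HadoopOutputFormat<>(new TableOutputFormat<LongWritable>(), job));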