#### Background ####
Spark GraphX does not ship a maximal clique mining algorithm.
The maximal clique algorithms available today are serial, based on the Bron–Kerbosch algorithm.
#### Approach ####
Spark GraphX does provide a connected components algorithm. Connected components and maximal cliques are both concepts on undirected graphs, and every maximal clique is contained in a single connected component.
So: use Spark GraphX to find the connected components, then run a serial maximal clique algorithm inside each component (pseudo-parallelism).
For densely connected graphs, however, the components found can be very large, and the serial maximal clique algorithm still takes a long time. Pruning is used to shrink the input first (see the sketch below), but for large graphs the room for optimization is limited.
A truly parallel maximal clique algorithm remains something to look forward to.
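The pruning rests on one fact: a vertex that belongs to a clique of `count` vertices must have degree at least `count - 1`, so any lower-degree vertex (and its edges) can be discarded, which in turn may push other vertices below the threshold. A minimal local sketch of the idea, using plain Scala collections instead of RDDs (`PruneSketch`, `prune`, and the toy edge set are illustrative names, not part of the job below):

```scala
object PruneSketch {
  // One pruning pass: drop every edge that touches a vertex
  // whose degree is below k - 1.
  def prune(edges: Set[(Long, Long)], k: Int): Set[(Long, Long)] = {
    val degree = edges.toSeq
      .flatMap { case (a, b) => Seq(a, b) }
      .groupBy(identity)
      .mapValues(_.size)
    val keep = degree.collect { case (v, d) if d >= k - 1 => v }.toSet
    edges.filter { case (a, b) => keep(a) && keep(b) }
  }

  def main(args: Array[String]): Unit = {
    // Vertex 7 has degree 1 < 2, so edge (6,7) is pruned when k = 3.
    var edges = Set((0L, 1L), (0L, 2L), (1L, 2L), (6L, 7L))
    var previous = -1
    // Iterate to a fixed point, mirroring the numIter loop in the job.
    while (edges.size != previous) {
      previous = edges.size
      edges = prune(edges, 3)
    }
    println(edges) // Set((0,1), (0,2), (1,2))
  }
}
```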
#### Configuration ####
- graph_data_path=hdfs://localhost/graph_data
- out_path=hdfs://localhost/clique
- ck_path=hdfs://localhost/checkpoint
- numIter=50 maximum number of pruning passes
- count=3 minimum number of vertices a reported clique must have
- algorithm=2 maximal clique implementation: 1 = hand-rolled, 2 = jgrapht
- percent=90 early-stop threshold for pruning, as a percentage of the previous pass's vertex count; if a pass still keeps 90% of the vertices, pruning is no longer paying off and the loop exits
- spark.master=local
- spark.app.name=graph
- spark.serializer=org.apache.spark.serializer.KryoSerializer
- spark.yarn.executor.memoryOverhead=20480
- spark.yarn.driver.memoryOverhead=20480
- spark.driver.extraJavaOptions=-XX:+UseG1GC -XX:+UseCompressedOops -XX:+DisableExplicitGC
- spark.executor.extraJavaOptions=-XX:+UseG1GC -XX:+UseCompressedOops -XX:+DisableExplicitGC
- spark.driver.maxResultSize=10g
- spark.default.parallelism=60
#### Sample data ####
One JSON record per line, each record a directed edge:
```
{"src":"0","dst":"1"}
{"src":"0","dst":"2"}
{"src":"0","dst":"3"}
{"src":"1","dst":"0"}
{"src":"2","dst":"1"}
{"src":"3","dst":"5"}
{"src":"4","dst":"6"}
{"src":"5","dst":"4"}
{"src":"6","dst":"5"}
{"src":"3","dst":"2"}
{"src":"2","dst":"3"}
{"src":"6","dst":"4"}
{"src":"3","dst":"4"}
{"src":"4","dst":"3"}
{"src":"2","dst":"6"}
{"src":"6","dst":"2"}
{"src":"6","dst":"7"}
{"src":"7","dst":"6"}
```
#### Sample graph ####
*(figure of the sample graph, not reproduced here)*
#### Output ####
Each output line is one maximal clique with at least `count` vertices:
```
0,1,2
0,2,3
3,4,5
4,5,6
```
#### Implementation ####
```scala
import java.util
import java.util.Properties

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.graphx.{Edge, Graph}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}
import org.jgrapht.alg.BronKerboschCliqueFinder
import org.jgrapht.graph.{DefaultEdge, SimpleGraph}

import scala.collection.JavaConverters._
import scala.collection.mutable

object ApplicationTitan {
  def main(args: Array[String]) {
    val prop = new Properties()
    prop.load(getClass.getResourceAsStream("/config.properties"))

    val graph_data_path = prop.getProperty("graph_data_path")
    val out_path = prop.getProperty("out_path")
    val ck_path = prop.getProperty("ck_path")
    val count = Integer.parseInt(prop.getProperty("count"))
    val numIter = Integer.parseInt(prop.getProperty("numIter"))
    val algorithm = Integer.parseInt(prop.getProperty("algorithm"))
    val percent = Integer.parseInt(prop.getProperty("percent"))

    val conf = new SparkConf()
    // Best-effort removal of a stale output directory.
    try {
      Runtime.getRuntime.exec("hdfs dfs -rm -r " + out_path)
      // Runtime.getRuntime.exec("cmd.exe /C rd /s /q " + out_path)
    } catch {
      case ex: Exception =>
        ex.printStackTrace(System.out)
    }

    // Forward every spark.* key from the properties file into the SparkConf.
    prop.stringPropertyNames().asScala.foreach(s => {
      if (s.startsWith("spark")) {
        conf.set(s, prop.getProperty(s))
      }
    })
    conf.registerKryoClasses(Array(getClass))

    val sc = new SparkContext(conf)
    sc.setLogLevel("ERROR")
    sc.setCheckpointDir(ck_path)
    val sqlc = new SQLContext(sc)
    try {
      val e_df = sqlc.read
        // .json(graph_data_path)
        .parquet(graph_data_path)

      // Normalize each directed edge to (smaller id, larger id) and
      // deduplicate, turning the input into an undirected edge set.
      var e_rdd = e_df
        .mapPartitions(it => {
          it.map({
            case Row(dst: String, src: String) =>
              val src_long = src.toLong
              val dst_long = dst.toLong
              if (src_long < dst_long) (src_long, dst_long) else (dst_long, src_long)
          })
        }).distinct()
      e_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)

      var bc: Broadcast[Set[Long]] = null
      var iter = 0
      var bc_size = 0
      // Pruning: a vertex inside a clique of `count` vertices must have
      // degree >= count - 1, so lower-degree vertices can never appear
      // in a reported clique and are dropped, pass after pass.
      while (iter <= numIter) {
        val temp = e_rdd
          .flatMap(x => List((x._1, 1), (x._2, 1)))
          .reduceByKey((x, y) => x + y)
          .filter(x => x._2 >= count - 1)
          .mapPartitions(it => it.map(x => x._1))
        val bc_value = temp.collect().toSet
        bc = sc.broadcast(bc_value)
        e_rdd = e_rdd.filter(x => bc.value.contains(x._1) && bc.value.contains(x._2))
        e_rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)
        iter += 1
        // Stop early once a pass keeps at least `percent`% of the
        // previous pass's vertices: pruning is no longer paying off.
        if (bc_size != 0 && bc_value.size >= bc_size * percent / 100) {
          println("total iter : " + iter)
          iter = Int.MaxValue
        }
        bc_size = bc_value.size
      }

      // Build the graph.
      val edge: RDD[Edge[Long]] = e_rdd.mapPartitions(it => it.map(x => Edge(x._1, x._2)))
      val graph = Graph.fromEdges(edge, 0, StorageLevel.MEMORY_AND_DISK_SER, StorageLevel.MEMORY_AND_DISK_SER)

      // Connected components: every vertex gets labelled with the
      // smallest vertex id in its component.
      val cc = graph.connectedComponents().vertices
      cc.persist(StorageLevel.MEMORY_AND_DISK_SER)

      cc.join(e_rdd)
        // Salt the component id with a random digit so one huge
        // component does not land on a single reducer ...
        .mapPartitions(it => it.map(x => ((math.random * 10).toInt.toString.concat(x._2._1.toString), (x._1, x._2._2))))
        .aggregateByKey(List[(Long, Long)]())((list, v) => list :+ v, (list1, list2) => list1 ::: list2)
        // ... then strip the salt and merge the partial edge lists per component.
        .mapPartitions(it => it.map(x => (x._1.substring(1), x._2)))
        .aggregateByKey(List[(Long, Long)]())((list1, list2) => list1 ::: list2, (list3, list4) => list3 ::: list4)
        .filter(x => x._2.size >= count - 1)
        .flatMap(x => {
          if (algorithm == 1)
            find(x, count)
          else
            find2(x, count)
        })
        .mapPartitions(it => it.map(set => set.asScala.mkString(",")))
        // .coalesce(1)
        .saveAsTextFile(out_path)
    }
    catch {
      case ex: Exception =>
        ex.printStackTrace(System.out)
    }
    sc.stop()
  }

  // Maximal clique search using the hand-rolled CliqueFinder below.
  def find(x: (String, List[(Long, Long)]), count: Int): mutable.Set[util.Set[String]] = {
    println(x._1 + "|s|" + x._2.size)
    println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())
    // Build the adjacency map for this component.
    val neighbors = new util.HashMap[String, util.Set[String]]
    x._2.foreach(r => {
      val v1 = r._1.toString
      val v2 = r._2.toString
      if (neighbors.containsKey(v1)) {
        neighbors.get(v1).add(v2)
      } else {
        val temp = new util.HashSet[String]()
        temp.add(v2)
        neighbors.put(v1, temp)
      }
      if (neighbors.containsKey(v2)) {
        neighbors.get(v2).add(v1)
      } else {
        val temp = new util.HashSet[String]()
        temp.add(v1)
        neighbors.put(v2, temp)
      }
    })
    val finder = new CliqueFinder(neighbors, count)
    println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())
    finder.findMaxCliques().asScala
  }

  // Maximal clique search via the Bron-Kerbosch implementation in jgrapht.
  def find2(x: (String, List[(Long, Long)]), count: Int): Set[util.Set[String]] = {
    println(x._1 + "|s|" + x._2.size)
    println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())
    val to_clique = new SimpleGraph[String, DefaultEdge](classOf[DefaultEdge])
    x._2.foreach(r => {
      val v1 = r._1.toString
      val v2 = r._2.toString
      to_clique.addVertex(v1)
      to_clique.addVertex(v2)
      to_clique.addEdge(v1, v2)
    })
    val finder = new BronKerboschCliqueFinder(to_clique)
    val list = finder.getAllMaximalCliques.asScala
    // Keep only the cliques with at least `count` vertices.
    var result = Set[util.Set[String]]()
    list.foreach(x => {
      if (x.size() >= count)
        result = result + x
    })
    println("BKCliqueFinder---" + x._1 + "---" + System.currentTimeMillis())
    result
  }
}
```
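One fragile spot worth flagging: the `case Row(dst: String, src: String)` pattern binds columns by position, so it silently depends on the column order (and string types) of the input schema. Because every edge is normalized to (min, max) afterwards, a src/dst swap happens to be harmless here, but a schema with different column order or types would fail with a `MatchError`. A hedged alternative that resolves columns by name (assuming string columns named `src` and `dst`, as in the sample data; `toUndirectedEdges` is an illustrative name):

```scala
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.DataFrame

// Same normalization as above, but binding columns by name.
def toUndirectedEdges(e_df: DataFrame): RDD[(Long, Long)] = {
  e_df.rdd.map { row =>
    val src = row.getAs[String]("src").toLong
    val dst = row.getAs[String]("dst").toLong
    if (src < dst) (src, dst) else (dst, src)
  }.distinct()
}
```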
The hand-rolled maximal clique finder used by `find` (Java):
```java
import java.util.*;

/**
 * Serial Bron-Kerbosch maximal clique finder.
 *
 * @author mopspecial@gmail.com
 * @date 2017/7/31
 */
public class CliqueFinder {
    private Map<String, Set<String>> neighbors;
    private Set<String> nodes;
    private Set<Set<String>> maxCliques = new HashSet<>();
    private Integer minSize;

    public CliqueFinder(Map<String, Set<String>> neighbors, Integer minSize) {
        this.neighbors = neighbors;
        // keySet() is a live view, so vertices added to the map later are visible too.
        this.nodes = neighbors.keySet();
        this.minSize = minSize;
    }

    // Outer level of Bron-Kerbosch: iterate over the candidates in
    // degeneracy order, recursing with the pivoting variant (bk2).
    private void bk3(Set<String> clique, List<String> candidates, List<String> excluded) {
        if (candidates.isEmpty() && excluded.isEmpty()) {
            if (!clique.isEmpty() && clique.size() >= minSize) {
                maxCliques.add(clique);
            }
            return;
        }
        for (String s : degeneracy_order(candidates)) {
            List<String> new_candidates = new ArrayList<>(candidates);
            new_candidates.retainAll(neighbors.get(s));
            List<String> new_excluded = new ArrayList<>(excluded);
            new_excluded.retainAll(neighbors.get(s));
            Set<String> nextClique = new HashSet<>(clique);
            nextClique.add(s);
            bk2(nextClique, new_candidates, new_excluded);
            candidates.remove(s);
            excluded.add(s);
        }
    }

    // Bron-Kerbosch with pivoting: only branch on candidates that are
    // not neighbors of the pivot, which prunes redundant subtrees.
    private void bk2(Set<String> clique, List<String> candidates, List<String> excluded) {
        if (candidates.isEmpty() && excluded.isEmpty()) {
            if (!clique.isEmpty() && clique.size() >= minSize) {
                maxCliques.add(clique);
            }
            return;
        }
        String pivot = pick_random(candidates);
        if (pivot == null) {
            pivot = pick_random(excluded);
        }
        List<String> tempc = new ArrayList<>(candidates);
        tempc.removeAll(neighbors.get(pivot));
        for (String s : tempc) {
            List<String> new_candidates = new ArrayList<>(candidates);
            new_candidates.retainAll(neighbors.get(s));
            List<String> new_excluded = new ArrayList<>(excluded);
            new_excluded.retainAll(neighbors.get(s));
            Set<String> nextClique = new HashSet<>(clique);
            nextClique.add(s);
            bk2(nextClique, new_candidates, new_excluded);
            candidates.remove(s);
            excluded.add(s);
        }
    }

    // Repeatedly remove a minimum-degree vertex to produce a degeneracy order.
    private List<String> degeneracy_order(List<String> innerNodes) {
        List<String> result = new ArrayList<>();
        Map<String, Integer> deg = new HashMap<>();
        for (String node : innerNodes) {
            deg.put(node, neighbors.get(node).size());
        }
        while (!deg.isEmpty()) {
            Integer min = Collections.min(deg.values());
            String minKey = null;
            for (String key : deg.keySet()) {
                if (deg.get(key).equals(min)) {
                    minKey = key;
                    break;
                }
            }
            result.add(minKey);
            deg.remove(minKey);
            for (String k : neighbors.get(minKey)) {
                if (deg.containsKey(k)) {
                    deg.put(k, deg.get(k) - 1);
                }
            }
        }
        return result;
    }

    // Despite the name, this deterministically returns the first element
    // (or null when the list is empty).
    private String pick_random(List<String> random) {
        if (random != null && !random.isEmpty()) {
            return random.get(0);
        } else {
            return null;
        }
    }

    public Set<Set<String>> findMaxCliques() {
        this.bk3(new HashSet<>(), new ArrayList<>(nodes), new ArrayList<>());
        return maxCliques;
    }

    // Small self-test on the sample graph from above.
    public static void main(String[] args) {
        Map<String, Set<String>> neighbors = new HashMap<>();
        neighbors.put("0", new HashSet<>(Arrays.asList("1", "2", "3")));
        neighbors.put("1", new HashSet<>(Arrays.asList("0", "2")));
        neighbors.put("2", new HashSet<>(Arrays.asList("0", "1", "3", "6")));
        neighbors.put("3", new HashSet<>(Arrays.asList("0", "2", "4", "5")));
        neighbors.put("4", new HashSet<>(Arrays.asList("3", "5", "6")));
        neighbors.put("5", new HashSet<>(Arrays.asList("3", "4", "6")));
        neighbors.put("6", new HashSet<>(Arrays.asList("2", "4", "5")));
        neighbors.put("7", new HashSet<>(Arrays.asList("6")));
        CliqueFinder finder = new CliqueFinder(neighbors, 3);
        finder.bk3(new HashSet<>(), new ArrayList<>(neighbors.keySet()), new ArrayList<>());
        System.out.println(finder.maxCliques);
    }
}
```
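For reference, the `main` method above encodes the same graph as the sample data section, so running it should print the four maximal cliques from the output section ({0,1,2}, {0,2,3}, {3,4,5}, {4,5,6}), though the iteration order of the sets may vary.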