From e2597c80cef9dcd6dc1809b26131a7b44fdac5b7 Mon Sep 17 00:00:00 2001 From: xiongyutian Date: Tue, 7 Feb 2023 17:15:11 +0800 Subject: [PATCH] update kaltest-spark3.1.1 --- tools/kal-test/README.md | 51 ++ .../bin/compare/graph/3rd_para_compare.sh | 70 +++ .../bin/compare/graph/deepwalk_compare.sh | 70 +++ .../bin/compare/graph/incpr_compare.sh | 51 ++ .../kal-test/bin/compare/graph/lpa_compare.sh | 36 ++ .../bin/compare/graph/node2vec_compare.sh | 33 + .../kal-test/bin/compare/graph/ppr_compare.sh | 30 + tools/kal-test/bin/compare/ml/idf_compare.sh | 42 ++ tools/kal-test/bin/compare/ml/knn_compare.sh | 109 ++++ .../kal-test/bin/compare/ml/major_compare.sh | 60 ++ .../bin/compare/ml/simrank_compare.sh | 39 ++ tools/kal-test/bin/graph/betweenness_run.sh | 195 ++++++ tools/kal-test/bin/graph/bfs_run.sh | 183 ++++++ tools/kal-test/bin/graph/cc_run.sh | 130 ++++ tools/kal-test/bin/graph/cd_run.sh | 113 ++++ tools/kal-test/bin/graph/closeness_run.sh | 129 ++++ .../kal-test/bin/graph/closeness_run_hive.sh | 71 +++ .../bin/graph/clusteringcoefficient_run.sh | 157 +++++ tools/kal-test/bin/graph/deepwalk_run.sh | 144 +++++ tools/kal-test/bin/graph/degree_run.sh | 161 +++++ tools/kal-test/bin/graph/fraudar_run.sh | 135 ++++ tools/kal-test/bin/graph/inccc_run.sh | 125 ++++ tools/kal-test/bin/graph/incpr_run.sh | 136 ++++ tools/kal-test/bin/graph/katz_run.sh | 116 ++++ tools/kal-test/bin/graph/kcore_run.sh | 124 ++++ tools/kal-test/bin/graph/kcore_run_hive.sh | 57 ++ tools/kal-test/bin/graph/louvain_run.sh | 149 +++++ tools/kal-test/bin/graph/louvain_run_hive.sh | 69 ++ tools/kal-test/bin/graph/lpa_run.sh | 125 ++++ tools/kal-test/bin/graph/mce_run.sh | 102 +++ tools/kal-test/bin/graph/mce_run_hive.sh | 64 ++ tools/kal-test/bin/graph/modularity_run.sh | 120 ++++ tools/kal-test/bin/graph/mssp_run.sh | 114 ++++ tools/kal-test/bin/graph/node2vec_run.sh | 161 +++++ tools/kal-test/bin/graph/ppr_run.sh | 206 ++++++ tools/kal-test/bin/graph/pr_run.sh | 134 ++++ tools/kal-test/bin/graph/pr_run_hive.sh | 66 ++ tools/kal-test/bin/graph/scc_run.sh | 149 +++++ tools/kal-test/bin/graph/sgm_run.sh | 237 +++++++ tools/kal-test/bin/graph/tc_run.sh | 144 +++++ tools/kal-test/bin/graph/tpr_run.sh | 124 ++++ tools/kal-test/bin/graph/tr_run.sh | 117 ++++ tools/kal-test/bin/graph/wce_run.sh | 88 +++ tools/kal-test/bin/graph/wce_run_hive.sh | 48 ++ tools/kal-test/bin/graph/wlpa_run.sh | 117 ++++ tools/kal-test/bin/graph/wpr_run.sh | 148 +++++ tools/kal-test/bin/graph_workflow.sh | 312 ++++++++++ tools/kal-test/bin/ml/als_run.sh | 137 ++++ tools/kal-test/bin/ml/bo_run.sh | 129 ++++ tools/kal-test/bin/ml/cov_run.sh | 145 +++++ tools/kal-test/bin/ml/dbscan_run.sh | 169 +++++ tools/kal-test/bin/ml/dt_run.sh | 145 +++++ tools/kal-test/bin/ml/dtb_run.sh | 148 +++++ tools/kal-test/bin/ml/encoder_run.sh | 134 ++++ tools/kal-test/bin/ml/fm_run.sh | 133 ++++ tools/kal-test/bin/ml/fpg_run.sh | 124 ++++ tools/kal-test/bin/ml/gbdt_run.sh | 132 ++++ tools/kal-test/bin/ml/hdb_run.sh | 124 ++++ tools/kal-test/bin/ml/idf_run.sh | 125 ++++ tools/kal-test/bin/ml/if_run.sh | 121 ++++ tools/kal-test/bin/ml/kmeans_run.sh | 137 ++++ tools/kal-test/bin/ml/knn_run.sh | 145 +++++ tools/kal-test/bin/ml/lda_run.sh | 132 ++++ tools/kal-test/bin/ml/lgbm_run.sh | 131 ++++ tools/kal-test/bin/ml/linR_run.sh | 146 +++++ tools/kal-test/bin/ml/logR_run.sh | 125 ++++ tools/kal-test/bin/ml/nmf_run.sh | 128 ++++ tools/kal-test/bin/ml/pca_run.sh | 136 ++++ tools/kal-test/bin/ml/pearson_run.sh | 144 +++++ tools/kal-test/bin/ml/ps_run.sh | 135 ++++ 
tools/kal-test/bin/ml/rf_run.sh | 149 +++++ tools/kal-test/bin/ml/simrank_run.sh | 134 ++++ tools/kal-test/bin/ml/spca_run.sh | 148 +++++ tools/kal-test/bin/ml/spearman_run.sh | 145 +++++ tools/kal-test/bin/ml/svd_run.sh | 147 +++++ tools/kal-test/bin/ml/svm_run.sh | 130 ++++ tools/kal-test/bin/ml/te_run.sh | 132 ++++ tools/kal-test/bin/ml/word2vec_run.sh | 151 +++++ tools/kal-test/bin/ml/xgbt_run.sh | 153 +++++ tools/kal-test/bin/ml_workflow.sh | 414 ++++++++++++ .../preprocess/graph/incpr_data_process.sh | 59 ++ .../bin/preprocess/graph/tpr_data_process.sh | 36 ++ .../bin/preprocess/ml/encoder_data_gen.sh | 58 ++ .../conf/graph/betweenness/betweenness.yml | 17 + .../betweenness/betweenness_spark.properties | 53 ++ tools/kal-test/conf/graph/bfs/bfs.yml | 32 + .../conf/graph/bfs/bfs_source_id.properties | 13 + .../conf/graph/bfs/bfs_spark.properties | 78 +++ tools/kal-test/conf/graph/cc/cc.yml | 14 + .../conf/graph/cc/cc_spark.properties | 19 + tools/kal-test/conf/graph/cd/cd.yml | 18 + .../conf/graph/cd/cd_spark.properties | 33 + .../closeness/closeness_spark.properties | 79 +++ .../clusteringcoefficient.yml | 38 ++ .../clusteringcoefficient_spark.properties | 60 ++ .../kal-test/conf/graph/deepwalk/deepwalk.yml | 56 ++ .../graph/deepwalk/deepwalk_spark.properties | 56 ++ tools/kal-test/conf/graph/degree/degree.yml | 19 + .../conf/graph/degree/degree_spark.properties | 99 +++ tools/kal-test/conf/graph/fraudar/fraudar.yml | 26 + .../graph/fraudar/fraudar_spark.properties | 33 + .../conf/graph/graph_datasets.properties | 83 +++ tools/kal-test/conf/graph/inccc/inccc.yml | 13 + .../conf/graph/inccc/inccc_spark.properties | 30 + tools/kal-test/conf/graph/incpr/incpr.yml | 8 + .../conf/graph/incpr/incpr_spark.properties | 10 + tools/kal-test/conf/graph/katz/katz.yml | 26 + .../conf/graph/katz/katz_spark.properties | 30 + tools/kal-test/conf/graph/kcore/kcore.yml | 25 + .../conf/graph/kcore/kcore_spark.properties | 47 ++ tools/kal-test/conf/graph/louvain/louvain.yml | 32 + .../graph/louvain/louvain_spark.properties | 74 +++ tools/kal-test/conf/graph/lpa/lpa.yml | 9 + .../conf/graph/lpa/lpa_spark.properties | 33 + tools/kal-test/conf/graph/mce/mce.yml | 10 + .../conf/graph/mce/mce_spark.properties | 9 + .../conf/graph/modularity/modularity.yml | 44 ++ .../modularity/modularity_spark.properties | 63 ++ .../conf/graph/mssp/mssp_spark.properties | 9 + .../kal-test/conf/graph/node2vec/node2vec.yml | 41 ++ .../graph/node2vec/node2vec_spark.properties | 14 + tools/kal-test/conf/graph/ppr/ppr.yml | 23 + .../conf/graph/ppr/ppr_source_id.properties | 7 + .../conf/graph/ppr/ppr_spark.properties | 111 ++++ tools/kal-test/conf/graph/pr/pr.yml | 20 + .../conf/graph/pr/pr_spark.properties | 74 +++ tools/kal-test/conf/graph/scc/scc.yml | 4 + .../conf/graph/scc/scc_spark.properties | 20 + tools/kal-test/conf/graph/sgm/sgm.yml | 6 + .../conf/graph/sgm/sgm_spark.properties | 367 +++++++++++ tools/kal-test/conf/graph/tc/tc.yml | 8 + .../conf/graph/tc/tc_spark.properties | 11 + tools/kal-test/conf/graph/tpr/tpr.yml | 23 + .../conf/graph/tpr/tpr_spark.properties | 11 + tools/kal-test/conf/graph/tr/tr.yml | 64 ++ .../conf/graph/tr/tr_spark.properties | 219 +++++++ tools/kal-test/conf/graph/wce/wce.yml | 20 + .../conf/graph/wce/wce_spark.properties | 30 + tools/kal-test/conf/graph/wlpa/wlpa.yml | 32 + .../conf/graph/wlpa/wlpa_spark.properties | 33 + tools/kal-test/conf/graph/wpr/wpr.yml | 30 + .../conf/graph/wpr/wpr_spark.properties | 51 ++ tools/kal-test/conf/ml/als/als.yml | 127 ++++ 
.../kal-test/conf/ml/als/als_spark.properties | 15 + tools/kal-test/conf/ml/bo/bo.yml | 22 + tools/kal-test/conf/ml/bo/bo_spark.properties | 15 + tools/kal-test/conf/ml/cov/cov.yml | 22 + .../kal-test/conf/ml/cov/cov_spark.properties | 19 + tools/kal-test/conf/ml/dbscan/dbscan.yml | 20 + .../conf/ml/dbscan/dbscan_spark.properties | 26 + tools/kal-test/conf/ml/dt/dt.yml | 354 +++++++++++ tools/kal-test/conf/ml/dt/dt_spark.properties | 71 +++ tools/kal-test/conf/ml/dtb/dtb.yml | 50 ++ .../kal-test/conf/ml/dtb/dtb_spark.properties | 29 + tools/kal-test/conf/ml/encoder/encoder.yml | 20 + .../conf/ml/encoder/encoder_spark.properties | 33 + tools/kal-test/conf/ml/fm/fm.yml | 136 ++++ tools/kal-test/conf/ml/fm/fm_spark.properties | 89 +++ tools/kal-test/conf/ml/fpg/fpg.yml | 57 ++ .../kal-test/conf/ml/fpg/fpg_spark.properties | 43 ++ tools/kal-test/conf/ml/gbdt/gbdt.yml | 78 +++ .../conf/ml/gbdt/gbdt_spark.properties | 16 + tools/kal-test/conf/ml/hdb/hdb.yml | 28 + .../kal-test/conf/ml/hdb/hdb_spark.properties | 31 + tools/kal-test/conf/ml/idf/idf.yml | 25 + .../kal-test/conf/ml/idf/idf_spark.properties | 33 + tools/kal-test/conf/ml/if/if.yml | 52 ++ tools/kal-test/conf/ml/if/if_spark.properties | 25 + tools/kal-test/conf/ml/kmeans/kmeans.yml | 47 ++ .../conf/ml/kmeans/kmeans_spark.properties | 18 + tools/kal-test/conf/ml/knn/knn.yml | 59 ++ .../kal-test/conf/ml/knn/knn_spark.properties | 54 ++ tools/kal-test/conf/ml/lda/lda.yml | 58 ++ .../kal-test/conf/ml/lda/lda_spark.properties | 16 + tools/kal-test/conf/ml/lgbm/lgbm.yml | 294 +++++++++ .../conf/ml/lgbm/lgbm_spark.properties | 51 ++ tools/kal-test/conf/ml/linR/linR.yml | 58 ++ .../conf/ml/linR/linR_spark.properties | 19 + tools/kal-test/conf/ml/logR/logR.yml | 76 +++ .../conf/ml/logR/logR_spark.properties | 16 + tools/kal-test/conf/ml/ml_datasets.properties | 102 +++ tools/kal-test/conf/ml/nmf/nmf.yml | 52 ++ .../kal-test/conf/ml/nmf/nmf_spark.properties | 51 ++ tools/kal-test/conf/ml/pca/pca.yml | 53 ++ .../kal-test/conf/ml/pca/pca_spark.properties | 55 ++ tools/kal-test/conf/ml/pearson/pearson.yml | 36 ++ .../conf/ml/pearson/pearson_spark.properties | 19 + tools/kal-test/conf/ml/ps/ps.yml | 65 ++ tools/kal-test/conf/ml/ps/ps_spark.properties | 16 + tools/kal-test/conf/ml/rf/rf.yml | 466 ++++++++++++++ tools/kal-test/conf/ml/rf/rf_spark.properties | 91 +++ tools/kal-test/conf/ml/simrank/simrank.yml | 64 ++ .../conf/ml/simrank/simrank_spark.properties | 18 + tools/kal-test/conf/ml/spca/spca.yml | 75 +++ .../conf/ml/spca/spca_spark.properties | 70 +++ tools/kal-test/conf/ml/spearman/spearman.yml | 22 + .../ml/spearman/spearman_spark.properties | 54 ++ tools/kal-test/conf/ml/svd/svd.yml | 67 ++ .../kal-test/conf/ml/svd/svd_spark.properties | 69 ++ tools/kal-test/conf/ml/svm/svm.yml | 39 ++ .../kal-test/conf/ml/svm/svm_spark.properties | 18 + tools/kal-test/conf/ml/te/te.yml | 61 ++ tools/kal-test/conf/ml/te/te_spark.properties | 51 ++ tools/kal-test/conf/ml/word2vec/word2vec.yml | 182 ++++++ .../ml/word2vec/word2vec_spark.properties | 49 ++ tools/kal-test/conf/ml/xgbt/xgbt.yml | 252 ++++++++ .../conf/ml/xgbt/xgbt_spark.properties | 77 +++ tools/kal-test/pom.xml | 129 ++++ .../com/bigdata/compare/graph/BFSVerify.scala | 30 + .../graph/BetweennessClosenessVerify.scala | 33 + .../com/bigdata/compare/graph/CCVerify.scala | 28 + .../compare/graph/CDDegreeVerify.scala | 20 + .../graph/ClusteringCoefficientTCVerify.scala | 28 + .../compare/graph/DeepWalkVerify.scala | 156 +++++ .../compare/graph/IncPageRankVerify.scala | 171 +++++ 
.../bigdata/compare/graph/KCoreVerify.scala | 31 + .../com/bigdata/compare/graph/LpaVerify.scala | 30 + .../bigdata/compare/graph/MceWceVerify.scala | 28 + .../bigdata/compare/graph/MsspVerify.scala | 60 ++ .../compare/graph/Node2vecVerify.scala | 47 ++ .../compare/graph/PageRankVerify.scala | 34 + .../graph/PersonalizedPageRankVerify.scala | 77 +++ .../com/bigdata/compare/graph/SCCVerify.scala | 42 ++ .../graph/TrillionPageRankVerify.scala | 33 + .../compare/graph/TrustRankVerify.scala | 40 ++ .../graph/WeightedPageRankVerify.scala | 29 + .../com/bigdata/compare/ml/DTBVerify.scala | 44 ++ .../compare/ml/DownEvaluationVerify.scala | 41 ++ .../bigdata/compare/ml/EncoderVerify.scala | 57 ++ .../bigdata/compare/ml/EvaluationVerify.scala | 42 ++ .../com/bigdata/compare/ml/FPGVerify.scala | 53 ++ .../com/bigdata/compare/ml/IDFVerify.scala | 61 ++ .../com/bigdata/compare/ml/KNNVerify.scala | 214 +++++++ .../com/bigdata/compare/ml/LDAVerify.scala | 42 ++ .../com/bigdata/compare/ml/MatrixVerify.scala | 88 +++ .../bigdata/compare/ml/PrefixSpanVerify.scala | 58 ++ .../com/bigdata/compare/ml/SVDVerify.scala | 115 ++++ .../bigdata/compare/ml/SimRankVerify.scala | 61 ++ .../com/bigdata/compare/ml/TEVerify.scala | 87 +++ .../compare/ml/UpEvaluationVerify.scala | 42 ++ .../compare/ml/Word2VecEvaluation.scala | 131 ++++ .../scala/com/bigdata/graph/BFSRunner.scala | 119 ++++ .../com/bigdata/graph/BetweennessRunner.scala | 110 ++++ .../bigdata/graph/ClosenessHiveRunner.scala | 85 +++ .../com/bigdata/graph/ClosenessRunner.scala | 122 ++++ .../graph/ClusteringCoefficientRunner.scala | 140 +++++ .../graph/ConnectedComponentsRunner.scala | 100 +++ .../CycleDetectionWithConstrainsRunner.scala | 122 ++++ .../com/bigdata/graph/DeepWalkRunner.scala | 111 ++++ .../com/bigdata/graph/DegreeRunner.scala | 112 ++++ .../com/bigdata/graph/FraudarRunner.scala | 324 ++++++++++ .../graph/IncConnectedComponentsRunner.scala | 108 ++++ .../com/bigdata/graph/IncPageRankRunner.scala | 123 ++++ .../graph/KCoreDecompositionHiveRunner.scala | 72 +++ .../graph/KCoreDecompositionRunner.scala | 97 +++ .../com/bigdata/graph/KatzCentrality.scala | 107 ++++ .../scala/com/bigdata/graph/KcoreMain.scala | 152 +++++ .../graph/LabelPropagationRunner.scala | 115 ++++ .../com/bigdata/graph/LouvainHiveRunner.scala | 87 +++ .../com/bigdata/graph/LouvainRunner.scala | 116 ++++ .../scala/com/bigdata/graph/MSSPRunner.scala | 113 ++++ .../MaximalCliqueEnumerationHiveRunner.scala | 84 +++ .../MaximalCliqueEnumerationRunner.scala | 91 +++ .../com/bigdata/graph/ModularityRunner.scala | 108 ++++ .../com/bigdata/graph/Node2VecRunner.scala | 130 ++++ .../bigdata/graph/PageRankHiveRunner.scala | 84 +++ .../com/bigdata/graph/PageRankRunner.scala | 130 ++++ .../graph/PersonalizedPageRankRunner.scala | 132 ++++ .../StronglyConnectedComponentsRunner.scala | 96 +++ .../graph/SubgraphMatchingRunner.scala | 112 ++++ .../bigdata/graph/TrangleCountRunner.scala | 110 ++++ .../graph/TrillionPageRankRunner.scala | 181 ++++++ .../com/bigdata/graph/TrustRankRunner.scala | 130 ++++ .../main/scala/com/bigdata/graph/Util.scala | 363 +++++++++++ .../com/bigdata/graph/WCEHiveRunner.scala | 78 +++ .../scala/com/bigdata/graph/WCERunner.scala | 105 ++++ .../WeightedLablePropagationRunner.scala | 102 +++ .../graph/WeightedPageRankRunner.scala | 129 ++++ .../main/scala/com/bigdata/ml/ALSRunner.scala | 301 +++++++++ .../main/scala/com/bigdata/ml/BORunner.scala | 369 +++++++++++ .../main/scala/com/bigdata/ml/CovRunner.scala | 128 ++++ .../main/scala/com/bigdata/ml/DTBRunner.scala | 
295 +++++++++ .../main/scala/com/bigdata/ml/DTRunner.scala | 422 +++++++++++++ .../scala/com/bigdata/ml/EncoderRunner.scala | 162 +++++ .../main/scala/com/bigdata/ml/FMRunner.scala | 252 ++++++++ .../main/scala/com/bigdata/ml/FPGRunner.scala | 184 ++++++ .../scala/com/bigdata/ml/GBDTRunner.scala | 326 ++++++++++ .../main/scala/com/bigdata/ml/HDBRunner.scala | 170 +++++ .../main/scala/com/bigdata/ml/IDFRunner.scala | 141 +++++ .../main/scala/com/bigdata/ml/IFRunner.scala | 205 ++++++ .../scala/com/bigdata/ml/KMeansRunner.scala | 200 ++++++ .../main/scala/com/bigdata/ml/KNNRunner.scala | 244 ++++++++ .../main/scala/com/bigdata/ml/LDARunner.scala | 282 +++++++++ .../scala/com/bigdata/ml/LightGBMRunner.scala | 283 +++++++++ .../scala/com/bigdata/ml/LinRRunner.scala | 236 +++++++ .../scala/com/bigdata/ml/LogRRunner.scala | 221 +++++++ .../main/scala/com/bigdata/ml/NMFRunner.scala | 225 +++++++ .../main/scala/com/bigdata/ml/PCARunner.scala | 232 +++++++ .../scala/com/bigdata/ml/PearsonRunner.scala | 183 ++++++ .../com/bigdata/ml/PrefixSpanRunner.scala | 190 ++++++ .../main/scala/com/bigdata/ml/RFRunner.scala | 405 ++++++++++++ .../scala/com/bigdata/ml/SPCARunner.scala | 205 ++++++ .../main/scala/com/bigdata/ml/SVDRunner.scala | 174 ++++++ .../main/scala/com/bigdata/ml/SVMRunner.scala | 187 ++++++ .../scala/com/bigdata/ml/SimRankRunner.scala | 154 +++++ .../scala/com/bigdata/ml/SpearManRunner.scala | 190 ++++++ .../main/scala/com/bigdata/ml/TERunner.scala | 339 ++++++++++ .../scala/com/bigdata/ml/Word2VecRunner.scala | 185 ++++++ .../scala/com/bigdata/ml/XGBTRunner.scala | 229 +++++++ .../graph/IncDataGeneratorBatch.scala | 115 ++++ .../graph/TrillionPageRankDataProcess.scala | 18 + .../preprocess/ml/EncoderDataGenRun.scala | 184 ++++++ .../com/bigdata/utils/DTBucketUtils.scala | 35 ++ .../scala/com/bigdata/utils/TimeUtils.scala | 48 ++ .../main/scala/com/bigdata/utils/Utils.scala | 156 +++++ .../automl/AngelBayesianOptimization.scala | 377 +++++++++++ .../spark/automl/AngelOptCrossValidator.scala | 235 +++++++ .../angle/spark/automl/tuner/TunerParam.scala | 48 ++ .../tuner/acquisition/Acquisition.scala | 38 ++ .../spark/automl/tuner/acquisition/EI.scala | 65 ++ .../spark/automl/tuner/acquisition/UCB.scala | 63 ++ .../acquisition/optimizer/AcqOptimizer.scala | 43 ++ .../acquisition/optimizer/LocalSearch.scala | 49 ++ .../acquisition/optimizer/RandomSearch.scala | 69 ++ .../automl/tuner/config/Configuration.scala | 72 +++ .../tuner/config/ConfigurationSpace.scala | 263 ++++++++ .../automl/tuner/config/EarlyStopping.scala | 59 ++ .../automl/tuner/kernel/Covariance.scala | 52 ++ .../automl/tuner/kernel/CovarianceType.scala | 42 ++ .../spark/automl/tuner/kernel/Matern3.scala | 91 +++ .../spark/automl/tuner/kernel/Matern5.scala | 90 +++ .../automl/tuner/kernel/Matern5Iso.scala | 93 +++ .../automl/tuner/kernel/SquareExpIso.scala | 85 +++ .../spark/automl/tuner/math/BreezeOp.scala | 86 +++ .../spark/automl/tuner/math/SquareDist.scala | 47 ++ .../spark/automl/tuner/model/GPExample.scala | 67 ++ .../automl/tuner/model/GPKernelDiffFunc.scala | 84 +++ .../spark/automl/tuner/model/GPModel.scala | 177 ++++++ .../tuner/parameter/ContinuousSpace.scala | 152 +++++ .../tuner/parameter/DiscreteSpace.scala | 138 ++++ .../automl/tuner/parameter/ParamParser.scala | 114 ++++ .../automl/tuner/parameter/ParamSpace.scala | 77 +++ .../spark/automl/tuner/solver/Solver.scala | 201 ++++++ .../automl/tuner/solver/SolverWithTrail.scala | 46 ++ .../automl/tuner/surrogate/GPSurrogate.scala | 86 +++ 
.../tuner/surrogate/NormalSurrogate.scala | 46 ++ .../automl/tuner/surrogate/RFSurrogate.scala | 96 +++ .../automl/tuner/surrogate/Surrogate.scala | 140 +++++ .../tuner/surrogate/SurrogateMode.scala | 33 + .../spark/automl/tuner/trail/TestRunner.scala | 34 + .../spark/automl/tuner/trail/TestTrail.scala | 35 ++ .../spark/automl/tuner/trail/Trail.scala | 29 + .../automl/tuner/trail/TrailRunner.scala | 32 + .../angle/spark/automl/utils/ArgsUtil.scala | 41 ++ .../spark/automl/utils/AutoMLException.scala | 19 + .../angle/spark/automl/utils/DataUtils.scala | 63 ++ .../spark/automl/utils/Distribution.scala | 30 + .../ml/classification/KNNClassifier.scala | 240 +++++++ .../spark/ml/clustering/DBSCANRunner.scala | 128 ++++ .../ml/feature/FeatureEncodingOrigin.scala | 165 +++++ .../scala/org/apache/spark/ml/knn/KNN.scala | 588 ++++++++++++++++++ .../org/apache/spark/ml/knn/MetricTree.scala | 397 ++++++++++++ .../ml/recommendation/SimRankOpenSource.scala | 151 +++++ .../spark/ml/regression/KNNRegression.scala | 156 +++++ .../apache/spark/ml/tuning/BaseRange.scala | 77 +++ .../apache/spark/ml/tuning/ParamSpace.scala | 100 +++ .../apache/spark/ml/tuning/ParamType.scala | 94 +++ .../org/apache/spark/mllib/knn/KNNUtils.scala | 20 + .../mllib/tree/DTBucketModelHelper.scala | 17 + .../apache/spark/util/PublicThreadUtils.scala | 6 + 370 files changed, 37738 insertions(+) create mode 100644 tools/kal-test/README.md create mode 100644 tools/kal-test/bin/compare/graph/3rd_para_compare.sh create mode 100644 tools/kal-test/bin/compare/graph/deepwalk_compare.sh create mode 100644 tools/kal-test/bin/compare/graph/incpr_compare.sh create mode 100644 tools/kal-test/bin/compare/graph/lpa_compare.sh create mode 100644 tools/kal-test/bin/compare/graph/node2vec_compare.sh create mode 100644 tools/kal-test/bin/compare/graph/ppr_compare.sh create mode 100644 tools/kal-test/bin/compare/ml/idf_compare.sh create mode 100644 tools/kal-test/bin/compare/ml/knn_compare.sh create mode 100644 tools/kal-test/bin/compare/ml/major_compare.sh create mode 100644 tools/kal-test/bin/compare/ml/simrank_compare.sh create mode 100644 tools/kal-test/bin/graph/betweenness_run.sh create mode 100644 tools/kal-test/bin/graph/bfs_run.sh create mode 100644 tools/kal-test/bin/graph/cc_run.sh create mode 100644 tools/kal-test/bin/graph/cd_run.sh create mode 100644 tools/kal-test/bin/graph/closeness_run.sh create mode 100644 tools/kal-test/bin/graph/closeness_run_hive.sh create mode 100644 tools/kal-test/bin/graph/clusteringcoefficient_run.sh create mode 100644 tools/kal-test/bin/graph/deepwalk_run.sh create mode 100644 tools/kal-test/bin/graph/degree_run.sh create mode 100644 tools/kal-test/bin/graph/fraudar_run.sh create mode 100644 tools/kal-test/bin/graph/inccc_run.sh create mode 100644 tools/kal-test/bin/graph/incpr_run.sh create mode 100644 tools/kal-test/bin/graph/katz_run.sh create mode 100644 tools/kal-test/bin/graph/kcore_run.sh create mode 100644 tools/kal-test/bin/graph/kcore_run_hive.sh create mode 100644 tools/kal-test/bin/graph/louvain_run.sh create mode 100644 tools/kal-test/bin/graph/louvain_run_hive.sh create mode 100644 tools/kal-test/bin/graph/lpa_run.sh create mode 100644 tools/kal-test/bin/graph/mce_run.sh create mode 100644 tools/kal-test/bin/graph/mce_run_hive.sh create mode 100644 tools/kal-test/bin/graph/modularity_run.sh create mode 100644 tools/kal-test/bin/graph/mssp_run.sh create mode 100644 tools/kal-test/bin/graph/node2vec_run.sh create mode 100644 tools/kal-test/bin/graph/ppr_run.sh create mode 100644 
tools/kal-test/bin/graph/pr_run.sh create mode 100644 tools/kal-test/bin/graph/pr_run_hive.sh create mode 100644 tools/kal-test/bin/graph/scc_run.sh create mode 100644 tools/kal-test/bin/graph/sgm_run.sh create mode 100644 tools/kal-test/bin/graph/tc_run.sh create mode 100644 tools/kal-test/bin/graph/tpr_run.sh create mode 100644 tools/kal-test/bin/graph/tr_run.sh create mode 100644 tools/kal-test/bin/graph/wce_run.sh create mode 100644 tools/kal-test/bin/graph/wce_run_hive.sh create mode 100644 tools/kal-test/bin/graph/wlpa_run.sh create mode 100644 tools/kal-test/bin/graph/wpr_run.sh create mode 100644 tools/kal-test/bin/graph_workflow.sh create mode 100644 tools/kal-test/bin/ml/als_run.sh create mode 100644 tools/kal-test/bin/ml/bo_run.sh create mode 100644 tools/kal-test/bin/ml/cov_run.sh create mode 100644 tools/kal-test/bin/ml/dbscan_run.sh create mode 100644 tools/kal-test/bin/ml/dt_run.sh create mode 100644 tools/kal-test/bin/ml/dtb_run.sh create mode 100644 tools/kal-test/bin/ml/encoder_run.sh create mode 100644 tools/kal-test/bin/ml/fm_run.sh create mode 100644 tools/kal-test/bin/ml/fpg_run.sh create mode 100644 tools/kal-test/bin/ml/gbdt_run.sh create mode 100644 tools/kal-test/bin/ml/hdb_run.sh create mode 100644 tools/kal-test/bin/ml/idf_run.sh create mode 100644 tools/kal-test/bin/ml/if_run.sh create mode 100644 tools/kal-test/bin/ml/kmeans_run.sh create mode 100644 tools/kal-test/bin/ml/knn_run.sh create mode 100644 tools/kal-test/bin/ml/lda_run.sh create mode 100644 tools/kal-test/bin/ml/lgbm_run.sh create mode 100644 tools/kal-test/bin/ml/linR_run.sh create mode 100644 tools/kal-test/bin/ml/logR_run.sh create mode 100644 tools/kal-test/bin/ml/nmf_run.sh create mode 100644 tools/kal-test/bin/ml/pca_run.sh create mode 100644 tools/kal-test/bin/ml/pearson_run.sh create mode 100644 tools/kal-test/bin/ml/ps_run.sh create mode 100644 tools/kal-test/bin/ml/rf_run.sh create mode 100644 tools/kal-test/bin/ml/simrank_run.sh create mode 100644 tools/kal-test/bin/ml/spca_run.sh create mode 100644 tools/kal-test/bin/ml/spearman_run.sh create mode 100644 tools/kal-test/bin/ml/svd_run.sh create mode 100644 tools/kal-test/bin/ml/svm_run.sh create mode 100644 tools/kal-test/bin/ml/te_run.sh create mode 100644 tools/kal-test/bin/ml/word2vec_run.sh create mode 100644 tools/kal-test/bin/ml/xgbt_run.sh create mode 100644 tools/kal-test/bin/ml_workflow.sh create mode 100644 tools/kal-test/bin/preprocess/graph/incpr_data_process.sh create mode 100644 tools/kal-test/bin/preprocess/graph/tpr_data_process.sh create mode 100644 tools/kal-test/bin/preprocess/ml/encoder_data_gen.sh create mode 100644 tools/kal-test/conf/graph/betweenness/betweenness.yml create mode 100644 tools/kal-test/conf/graph/betweenness/betweenness_spark.properties create mode 100644 tools/kal-test/conf/graph/bfs/bfs.yml create mode 100644 tools/kal-test/conf/graph/bfs/bfs_source_id.properties create mode 100644 tools/kal-test/conf/graph/bfs/bfs_spark.properties create mode 100644 tools/kal-test/conf/graph/cc/cc.yml create mode 100644 tools/kal-test/conf/graph/cc/cc_spark.properties create mode 100644 tools/kal-test/conf/graph/cd/cd.yml create mode 100644 tools/kal-test/conf/graph/cd/cd_spark.properties create mode 100644 tools/kal-test/conf/graph/closeness/closeness_spark.properties create mode 100644 tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient.yml create mode 100644 tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient_spark.properties create mode 100644 
tools/kal-test/conf/graph/deepwalk/deepwalk.yml create mode 100644 tools/kal-test/conf/graph/deepwalk/deepwalk_spark.properties create mode 100644 tools/kal-test/conf/graph/degree/degree.yml create mode 100644 tools/kal-test/conf/graph/degree/degree_spark.properties create mode 100644 tools/kal-test/conf/graph/fraudar/fraudar.yml create mode 100644 tools/kal-test/conf/graph/fraudar/fraudar_spark.properties create mode 100644 tools/kal-test/conf/graph/graph_datasets.properties create mode 100644 tools/kal-test/conf/graph/inccc/inccc.yml create mode 100644 tools/kal-test/conf/graph/inccc/inccc_spark.properties create mode 100644 tools/kal-test/conf/graph/incpr/incpr.yml create mode 100644 tools/kal-test/conf/graph/incpr/incpr_spark.properties create mode 100644 tools/kal-test/conf/graph/katz/katz.yml create mode 100644 tools/kal-test/conf/graph/katz/katz_spark.properties create mode 100644 tools/kal-test/conf/graph/kcore/kcore.yml create mode 100644 tools/kal-test/conf/graph/kcore/kcore_spark.properties create mode 100644 tools/kal-test/conf/graph/louvain/louvain.yml create mode 100644 tools/kal-test/conf/graph/louvain/louvain_spark.properties create mode 100644 tools/kal-test/conf/graph/lpa/lpa.yml create mode 100644 tools/kal-test/conf/graph/lpa/lpa_spark.properties create mode 100644 tools/kal-test/conf/graph/mce/mce.yml create mode 100644 tools/kal-test/conf/graph/mce/mce_spark.properties create mode 100644 tools/kal-test/conf/graph/modularity/modularity.yml create mode 100644 tools/kal-test/conf/graph/modularity/modularity_spark.properties create mode 100644 tools/kal-test/conf/graph/mssp/mssp_spark.properties create mode 100644 tools/kal-test/conf/graph/node2vec/node2vec.yml create mode 100644 tools/kal-test/conf/graph/node2vec/node2vec_spark.properties create mode 100644 tools/kal-test/conf/graph/ppr/ppr.yml create mode 100644 tools/kal-test/conf/graph/ppr/ppr_source_id.properties create mode 100644 tools/kal-test/conf/graph/ppr/ppr_spark.properties create mode 100644 tools/kal-test/conf/graph/pr/pr.yml create mode 100644 tools/kal-test/conf/graph/pr/pr_spark.properties create mode 100644 tools/kal-test/conf/graph/scc/scc.yml create mode 100644 tools/kal-test/conf/graph/scc/scc_spark.properties create mode 100644 tools/kal-test/conf/graph/sgm/sgm.yml create mode 100644 tools/kal-test/conf/graph/sgm/sgm_spark.properties create mode 100644 tools/kal-test/conf/graph/tc/tc.yml create mode 100644 tools/kal-test/conf/graph/tc/tc_spark.properties create mode 100644 tools/kal-test/conf/graph/tpr/tpr.yml create mode 100644 tools/kal-test/conf/graph/tpr/tpr_spark.properties create mode 100644 tools/kal-test/conf/graph/tr/tr.yml create mode 100644 tools/kal-test/conf/graph/tr/tr_spark.properties create mode 100644 tools/kal-test/conf/graph/wce/wce.yml create mode 100644 tools/kal-test/conf/graph/wce/wce_spark.properties create mode 100644 tools/kal-test/conf/graph/wlpa/wlpa.yml create mode 100644 tools/kal-test/conf/graph/wlpa/wlpa_spark.properties create mode 100644 tools/kal-test/conf/graph/wpr/wpr.yml create mode 100644 tools/kal-test/conf/graph/wpr/wpr_spark.properties create mode 100644 tools/kal-test/conf/ml/als/als.yml create mode 100644 tools/kal-test/conf/ml/als/als_spark.properties create mode 100644 tools/kal-test/conf/ml/bo/bo.yml create mode 100644 tools/kal-test/conf/ml/bo/bo_spark.properties create mode 100644 tools/kal-test/conf/ml/cov/cov.yml create mode 100644 tools/kal-test/conf/ml/cov/cov_spark.properties create mode 100644 tools/kal-test/conf/ml/dbscan/dbscan.yml create mode 
100644 tools/kal-test/conf/ml/dbscan/dbscan_spark.properties create mode 100644 tools/kal-test/conf/ml/dt/dt.yml create mode 100644 tools/kal-test/conf/ml/dt/dt_spark.properties create mode 100644 tools/kal-test/conf/ml/dtb/dtb.yml create mode 100644 tools/kal-test/conf/ml/dtb/dtb_spark.properties create mode 100644 tools/kal-test/conf/ml/encoder/encoder.yml create mode 100644 tools/kal-test/conf/ml/encoder/encoder_spark.properties create mode 100644 tools/kal-test/conf/ml/fm/fm.yml create mode 100644 tools/kal-test/conf/ml/fm/fm_spark.properties create mode 100644 tools/kal-test/conf/ml/fpg/fpg.yml create mode 100644 tools/kal-test/conf/ml/fpg/fpg_spark.properties create mode 100644 tools/kal-test/conf/ml/gbdt/gbdt.yml create mode 100644 tools/kal-test/conf/ml/gbdt/gbdt_spark.properties create mode 100644 tools/kal-test/conf/ml/hdb/hdb.yml create mode 100644 tools/kal-test/conf/ml/hdb/hdb_spark.properties create mode 100644 tools/kal-test/conf/ml/idf/idf.yml create mode 100644 tools/kal-test/conf/ml/idf/idf_spark.properties create mode 100644 tools/kal-test/conf/ml/if/if.yml create mode 100644 tools/kal-test/conf/ml/if/if_spark.properties create mode 100644 tools/kal-test/conf/ml/kmeans/kmeans.yml create mode 100644 tools/kal-test/conf/ml/kmeans/kmeans_spark.properties create mode 100644 tools/kal-test/conf/ml/knn/knn.yml create mode 100644 tools/kal-test/conf/ml/knn/knn_spark.properties create mode 100644 tools/kal-test/conf/ml/lda/lda.yml create mode 100644 tools/kal-test/conf/ml/lda/lda_spark.properties create mode 100644 tools/kal-test/conf/ml/lgbm/lgbm.yml create mode 100644 tools/kal-test/conf/ml/lgbm/lgbm_spark.properties create mode 100644 tools/kal-test/conf/ml/linR/linR.yml create mode 100644 tools/kal-test/conf/ml/linR/linR_spark.properties create mode 100644 tools/kal-test/conf/ml/logR/logR.yml create mode 100644 tools/kal-test/conf/ml/logR/logR_spark.properties create mode 100644 tools/kal-test/conf/ml/ml_datasets.properties create mode 100644 tools/kal-test/conf/ml/nmf/nmf.yml create mode 100644 tools/kal-test/conf/ml/nmf/nmf_spark.properties create mode 100644 tools/kal-test/conf/ml/pca/pca.yml create mode 100644 tools/kal-test/conf/ml/pca/pca_spark.properties create mode 100644 tools/kal-test/conf/ml/pearson/pearson.yml create mode 100644 tools/kal-test/conf/ml/pearson/pearson_spark.properties create mode 100644 tools/kal-test/conf/ml/ps/ps.yml create mode 100644 tools/kal-test/conf/ml/ps/ps_spark.properties create mode 100644 tools/kal-test/conf/ml/rf/rf.yml create mode 100644 tools/kal-test/conf/ml/rf/rf_spark.properties create mode 100644 tools/kal-test/conf/ml/simrank/simrank.yml create mode 100644 tools/kal-test/conf/ml/simrank/simrank_spark.properties create mode 100644 tools/kal-test/conf/ml/spca/spca.yml create mode 100644 tools/kal-test/conf/ml/spca/spca_spark.properties create mode 100644 tools/kal-test/conf/ml/spearman/spearman.yml create mode 100644 tools/kal-test/conf/ml/spearman/spearman_spark.properties create mode 100644 tools/kal-test/conf/ml/svd/svd.yml create mode 100644 tools/kal-test/conf/ml/svd/svd_spark.properties create mode 100644 tools/kal-test/conf/ml/svm/svm.yml create mode 100644 tools/kal-test/conf/ml/svm/svm_spark.properties create mode 100644 tools/kal-test/conf/ml/te/te.yml create mode 100644 tools/kal-test/conf/ml/te/te_spark.properties create mode 100644 tools/kal-test/conf/ml/word2vec/word2vec.yml create mode 100644 tools/kal-test/conf/ml/word2vec/word2vec_spark.properties create mode 100644 tools/kal-test/conf/ml/xgbt/xgbt.yml create 
mode 100644 tools/kal-test/conf/ml/xgbt/xgbt_spark.properties create mode 100644 tools/kal-test/pom.xml create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/BFSVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/BetweennessClosenessVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/CCVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/ClusteringCoefficientTCVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/IncPageRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/KCoreVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/LpaVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/MceWceVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/MsspVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/Node2vecVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/PageRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/PersonalizedPageRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/SCCVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrillionPageRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrustRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/graph/WeightedPageRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/DownEvaluationVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/EvaluationVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/FPGVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/IDFVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/KNNVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/LDAVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/MatrixVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/PrefixSpanVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/SVDVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/TEVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/UpEvaluationVerify.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/Word2VecEvaluation.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/BFSRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/BetweennessRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessHiveRunner.scala create mode 100644 
tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/ConnectedComponentsRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/CycleDetectionWithConstrainsRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/DeepWalkRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/DegreeRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/IncConnectedComponentsRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/IncPageRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionHiveRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/KatzCentrality.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/KcoreMain.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/LabelPropagationRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/LouvainHiveRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/LouvainRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/MSSPRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationHiveRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/ModularityRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/Node2VecRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/PageRankHiveRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/PageRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/PersonalizedPageRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/StronglyConnectedComponentsRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/SubgraphMatchingRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/TrangleCountRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/TrillionPageRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/TrustRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/Util.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/WCEHiveRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/WCERunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/graph/WeightedPageRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/ALSRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/CovRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/DTBRunner.scala create mode 100644 
tools/kal-test/src/main/scala/com/bigdata/ml/DTRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/EncoderRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/FMRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/FPGRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/GBDTRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/HDBRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/IFRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/KMeansRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/KNNRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/LDARunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/LinRRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/LogRRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/NMFRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/PCARunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/PearsonRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/PrefixSpanRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/RFRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/SPCARunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/SVDRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/SVMRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/SimRankRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/SpearManRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/TERunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/Word2VecRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/ml/XGBTRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/IncDataGeneratorBatch.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/TrillionPageRankDataProcess.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/EncoderDataGenRun.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/utils/DTBucketUtils.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/utils/TimeUtils.scala create mode 100644 tools/kal-test/src/main/scala/com/bigdata/utils/Utils.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelBayesianOptimization.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelOptCrossValidator.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/TunerParam.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/Acquisition.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/EI.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/UCB.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/AcqOptimizer.scala create mode 100644 
tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/LocalSearch.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/RandomSearch.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/Configuration.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/ConfigurationSpace.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/EarlyStopping.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Covariance.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/CovarianceType.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern3.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5Iso.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/SquareExpIso.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/BreezeOp.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/SquareDist.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPExample.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPKernelDiffFunc.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPModel.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ContinuousSpace.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/DiscreteSpace.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamParser.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamSpace.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/Solver.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/SolverWithTrail.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/GPSurrogate.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/NormalSurrogate.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/RFSurrogate.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/Surrogate.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/SurrogateMode.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestRunner.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestTrail.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/Trail.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TrailRunner.scala create mode 100644 
tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/ArgsUtil.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/AutoMLException.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/DataUtils.scala create mode 100644 tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/Distribution.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/classification/KNNClassifier.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/clustering/DBSCANRunner.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/feature/FeatureEncodingOrigin.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/knn/KNN.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/knn/MetricTree.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/recommendation/SimRankOpenSource.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/regression/KNNRegression.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/BaseRange.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamSpace.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamType.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/mllib/knn/KNNUtils.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/mllib/tree/DTBucketModelHelper.scala create mode 100644 tools/kal-test/src/main/scala/org/apache/spark/util/PublicThreadUtils.scala diff --git a/tools/kal-test/README.md b/tools/kal-test/README.md new file mode 100644 index 0000000..83ff55b --- /dev/null +++ b/tools/kal-test/README.md @@ -0,0 +1,51 @@ +# kal-test + + +### Description +The Kunpeng algorithm library test tool can be used to test machine learning and graph analysis algorithms. + + +### Compilation Tutorial + +#### Prerequisites +1. The Maven compilation environment has been configured. +2. The algorithm software package has been obtained. +#### Procedure +1. Go to the Spark-ml-algo-lib/tools/kal-test directory in the compilation environment. +2. Install the dependencies.
+ Taking Spark 2.3.2 as an example, the install commands are as follows:
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion=2.2.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.2.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion=2.2.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.2.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
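+ For a Spark 3.1.1 build, the same pattern applies with the Scala 2.12 client jars; the artifact names, version placeholder, and classifier below are assumptions, so substitute the jar names shipped with your algorithm package:
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.12 -Dversion=<kal_version> -Dclassifier=spark3.1.1 -Dfile=boostkit-graph-kernel-client_2.12-<kal_version>-spark3.1.1.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.12 -Dversion=<kal_version> -Dclassifier=spark3.1.1 -Dfile=boostkit-ml-kernel-client_2.12-<kal_version>-spark3.1.1.jar -Dpackaging=jar -DgeneratePom=true
+3. Run the compile command: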
+ mvn clean install +4. View the kal-test_2.11-0.1.jar file generated in Spark-ml-algo-lib/tools/kal-test/target. + +### Deployment and Usage Description + +1. Deploy the kal-test folder in the test environment, for example, in the /home/test/boostkit/ directory. If the directory does not exist, create one.
mkdir -p /home/test/boostkit/ +2. Go to the kal-test directory.
+ cd /home/test/boostkit/kal-test/ +3. Save the obtained boostkit-graph-kernel-scala_version-kal_version-spark_version-aarch64.jar, boostkit-graph-acc_scala_version-kal_version-spark_version.jar, and boostkit-graph-core_scala_version-kal_version-spark_version.jar files to /home/test/boostkit/kal-test/lib. +4. Go to the /home/test/boostkit/kal-test directory.
+ cd /home/test/boostkit/kal-test +5. Run the following command in the /home/test/boostkit/kal-test/ directory (taking the PageRank algorithm as an example):
bash bin/graph/pr_run.sh uk_2002 run no +6. Check the algorithm running status. + +### Algorithm and Dataset + + +| Algorithm | Dataset | Interface | +| :-----| ----: | :----: | +| PageRank | uk_2002 | run | + + +### References + +1. Open source KNN: https://github.com/saurfang/spark-knn.git +2. Open source BFS: https://github.com/prasad223/GraphxBFS +3. Open source DBSCAN: https://github.com/alitouka/spark_dbscan +4. Open source ClusteringCoefficient: https://github.com/apache/spark/pull/9150/files +5. Open source Betweenness: https://github.com/Sotera/distributed-graph-analytics/tree/master/dga-graphx/src/main/scala/com/soteradefense/dga/graphx/hbse +6. Open source Node2Vec: https://github.com/QuanLab/node2vec-spark +7. Open source SubgraphMatching: https://datalab.snu.ac.kr/pegasusn/scala-apidoc/#pegasus.spark.subgraph.PSE +8. Open source XGBoost: https://github.com/dmlc/xgboost/tree/v1.1.0 diff --git a/tools/kal-test/bin/compare/graph/3rd_para_compare.sh b/tools/kal-test/bin/compare/graph/3rd_para_compare.sh new file mode 100644 index 0000000..7fd4ead --- /dev/null +++ b/tools/kal-test/bin/compare/graph/3rd_para_compare.sh @@ -0,0 +1,70 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: algorithm name: betweenness, bfs, closeness, clusteringcoefficient, cc, cd, degree, kcore, mce, mssp, pr, scc, tc, tpr, tr, wce, wpr" + echo "2nd argument: path of baseline result" + echo "3rd argument: path of algorithm result" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +alg=$1 +path0=$2 +path1=$3 + +if [ $alg == "betweenness" ] || [ $alg == "closeness" ]; then + class_name=com.bigdata.compare.graph.BetweennessClosenessVerify +elif [ $alg == "bfs" ]; then + class_name=com.bigdata.compare.graph.BFSVerify +elif [ $alg == "clusteringcoefficient" ] || [ $alg == "tc" ]; then + class_name=com.bigdata.compare.graph.ClusteringCoefficientTCVerify +elif [ $alg == "cc" ]; then + class_name=com.bigdata.compare.graph.CCVerify +elif [ $alg == "cd" ] || [ $alg == "degree" ]; then + class_name=com.bigdata.compare.graph.CDDegreeVerify +elif [ $alg == "kcore" ]; then + class_name=com.bigdata.compare.graph.KCoreVerify +elif [ $alg == "mce" ] || [ $alg == "wce" ]; then + class_name=com.bigdata.compare.graph.MceWceVerify +elif [ $alg == "mssp" ]; then + class_name=com.bigdata.compare.graph.MsspVerify +elif [ $alg == "pr" ]; then + class_name=com.bigdata.compare.graph.PageRankVerify +elif [ $alg == "scc" ]; then + class_name=com.bigdata.compare.graph.SCCVerify +elif [ $alg == "tpr" ]; then + class_name=com.bigdata.compare.graph.TrillionPageRankVerify +elif [ $alg == "tr" ]; then + class_name=com.bigdata.compare.graph.TrustRankVerify +elif [ $alg == "wpr" ]; then + class_name=com.bigdata.compare.graph.WeightedPageRankVerify +else + alg_usage + exit 0 +fi + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class ${class_name} \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 50g \ +--conf "spark.driver.maxResultSize=100g" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/graph/deepwalk_compare.sh b/tools/kal-test/bin/compare/graph/deepwalk_compare.sh new file mode 100644 index 0000000..8f80ad3 --- /dev/null +++ b/tools/kal-test/bin/compare/graph/deepwalk_compare.sh @@ -0,0 +1,70 @@ +#!/bin/bash 
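+# Example invocation (illustrative): verify the DeepWalk result written by a previous run with is_raw=no,
+# assuming the cit_patents_deepwalk dataset and its negEdge path are defined in the sourced properties files:
+#   bash bin/compare/graph/deepwalk_compare.sh cit_patents_deepwalk no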
+set -e + + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: name of dataset: cit_patents_deepwalk" + echo "2nd argument: optimization algorithm or raw: no/yes" + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + echo "please input 2 arguments: " + echo "1st argument: name of dataset: cit_patents_deepwalk" + echo "2nd argument: optimization algorithm or raw: no/yes" + exit 0 +fi + +source conf/graph/deepwalk/deepwalk_spark.properties + +dataset_name=$1 +is_raw=$2 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +negEdge_data_path="${dataset_name}_negEdge" +negEdge_data_path_val=${!negEdge_data_path} +echo "${dataset_name} negEdge: ${negEdge_data_path_val}" + +model_conf=${dataset_name}-${cpu_name} +model_Path="/tmp/graph/result/deepawlk/${dataset_name}/${is_raw}" + + +spark-submit \ +--class com.bigdata.compare.graph.DeepWalkVerify \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 300g \ +--conf spark.kryoserializer.buffer.max=2047m \ +--conf spark.ui.showConsoleProgress=true \ +--conf spark.driver.maxResultSize=0 \ +--conf spark.driver.extraJavaOptions="-Xms300G -XX:hashCode=0" \ +--conf spark.executor.extraJavaOptions="-Xms35G -XX:hashCode=0" \ +--conf spark.rpc.askTimeout=1000000s \ +--conf spark.network.timeout=1000000s \ +--conf spark.executor.heartbeatInterval=100000s \ +--conf spark.rpc.message.maxSize=1000 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${negEdge_data_path_val} ${model_Path} ${is_raw} + + + diff --git a/tools/kal-test/bin/compare/graph/incpr_compare.sh b/tools/kal-test/bin/compare/graph/incpr_compare.sh new file mode 100644 index 0000000..5d837dd --- /dev/null +++ b/tools/kal-test/bin/compare/graph/incpr_compare.sh @@ -0,0 +1,51 @@ +#!/bin/bash +set -e + +opt_path=/tmp/graph/result/incpr/no +raw_path=/tmp/graph/result/incpr/yes + +case "$1" in +-h | --help | ?) 
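+ # Example invocation (illustrative, using the default result paths defined above):
+ #   bash bin/compare/graph/incpr_compare.sh /tmp/graph/result/incpr/no /tmp/graph/result/incpr/yes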
+ echo "Usage: " + echo "1st argument: path of incpr result: default value [${opt_path}]" + echo "2nd argument: path of tpr result: default value [${raw_path}]" + exit 0 + ;; +esac + +path0=$1 +path1=$2 + +path0=${path0:-${opt_path}} +path1=${path1:-${raw_path}} + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +split="\t" +numPart=273 +output_path=${output_path_prefix}/incpr/acc +dataset_name=twitter_2010 + +for rate in "0.001" "0.01" "0.05" +do + for batch in "1" "2" "3" "4" "5" + do + hdfs dfs -rm -r -f "${output_path}" + incpr_path=${path0}/${dataset_name}_${rate}_batch_${batch} + tpr_path=${path1}/${dataset_name}_${rate}_batch_${batch} + datasetPath=${!dataset_name}_${rate}_batch_${batch} + echo ">>> start twitter-2010_${rate}_batch_${batch} accuracy evaluation" + spark-submit \ + --class com.bigdata.compare.graph.IncPageRankVerify \ + --master yarn \ + --name incpr_${rate}_batch_${batch} \ + --num-executors 29 \ + --executor-memory 35g \ + --executor-cores 8 \ + --driver-memory 50g \ + ./lib/kal-test_${scala_version_val}-0.1.jar yarn ${incpr_path} ${tpr_path} ${split} ${numPart} ${output_path} ${datasetPath} + echo ">>> end twitter-2010_${rate}_batch_${batch} accuracy evaluation" + done +done \ No newline at end of file diff --git a/tools/kal-test/bin/compare/graph/lpa_compare.sh b/tools/kal-test/bin/compare/graph/lpa_compare.sh new file mode 100644 index 0000000..fca1faa --- /dev/null +++ b/tools/kal-test/bin/compare/graph/lpa_compare.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: path of dataset" + echo "2nd argument: path of baseline result" + echo "3rd argument: path of algorithm result" + exit 0 + ;; +esac + +input=$1 +path0=$2 +path1=$3 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.graph.LpaVerify \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 50g \ +--jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${input} ${path0} ${path1} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/graph/node2vec_compare.sh b/tools/kal-test/bin/compare/graph/node2vec_compare.sh new file mode 100644 index 0000000..8ed8a6f --- /dev/null +++ b/tools/kal-test/bin/compare/graph/node2vec_compare.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) 
+ echo "Usage: " + echo "1st argument: path of algorithm dataset" + echo "2nd argument: path of ground truth" + echo "3rd argument: path of algorithm result" + echo "4th argument: partition num, default value 240" + exit 0 + ;; +esac +input=$1 +ground=$2 +output=$3 +part=$4 + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.graph.Node2vecVerify \ +--master yarn \ +--num-executors 3 \ +--executor-memory 315g \ +--executor-cores 93 \ +--driver-memory 300g \ +--conf "spark.driver.maxResultSize=300g" \ +--conf spark.driver.extraJavaOptions="-Xms300g" \ +--jars "lib/smile-core-2.5.3.jar,lib/smile-math-2.5.3.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${input} ${ground} ${output} ${part:-240} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/graph/ppr_compare.sh b/tools/kal-test/bin/compare/graph/ppr_compare.sh new file mode 100644 index 0000000..56047f9 --- /dev/null +++ b/tools/kal-test/bin/compare/graph/ppr_compare.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: path of baseline result" + echo "2nd argument: path of algorithm result" + echo "3rd argument: name of api: fixMS,fixSS,conSS" + echo "4th argument: sourceCnt: 1,5,10,50,100, default value 240" + exit 0 + ;; +esac +path0=$1 +path1=$2 +api=$3 +src=$4 + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.graph.PersonalizedPageRankVerify \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 50g \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} ${api} ${src:-1} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/ml/idf_compare.sh b/tools/kal-test/bin/compare/ml/idf_compare.sh new file mode 100644 index 0000000..f970583 --- /dev/null +++ b/tools/kal-test/bin/compare/ml/idf_compare.sh @@ -0,0 +1,42 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: path of opt result: eg [hdfs:///tmp/ml/result/IDF/D2g250m]" + echo "2nd argument: path of raw result: eg [hdfs:///tmp/ml/result/IDF/D2g250m_raw]" + echo "Applicable to algorithm IDF" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +path0=$1 +path1=$2 + +source conf/ml/ml_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.compare.ml.IDFVerify \ +--master yarn \ +--deploy-mode client \ +--driver-cores 36 \ +--driver-memory 50g \ +--num-executors 12 \ +--executor-cores 23 \ +--executor-memory 79g \ +--conf "spark.executor.extraJavaOptions=-Xms79g" \ +--conf "spark.driver.maxResultSize=256g" \ +--driver-java-options "-Xms15g" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/ml/knn_compare.sh b/tools/kal-test/bin/compare/ml/knn_compare.sh new file mode 100644 index 0000000..cc02eed --- /dev/null +++ b/tools/kal-test/bin/compare/ml/knn_compare.sh @@ -0,0 +1,109 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. glove" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 1 ]; then + usage + exit 0 +fi + +dataset_name=$1 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +numExe=15 +exeCore=10 +exeMem=50 + +groundTruthLocalPath="result/KNN/${dataset_name}_truth" +groundTruthHDFSPath="${save_resultPath_val}/KNN/${dataset_name}_truth" +testNum=50000 +testBatchSize=5000 +k=100 +pt=150 + +rm -rf ${groundTruthLocalPath} +mkdir -p ${groundTruthLocalPath} + +echo "--------------- 生成真实解 ---------------" +spark-submit \ +--class com.bigdata.compare.ml.KNNVerify \ +--driver-class-path "./lib/kal-test_${scala_version_val}-0.1.jar" \ +--master yarn \ +--deploy-mode client \ +--num-executors ${numExe} \ +--executor-cores ${exeCore} \ +--executor-memory ${exeMem}g \ +--driver-cores 50 \ +--driver-memory 50g \ +--conf "spark.executor.extraJavaOptions=-Xms${exeMem}g" \ +--conf "spark.driver.maxResultSize=256G" \ +--conf "spark.scheduler.mode=FAIR" \ +--conf "spark.network.timeout=10000000" \ +--conf "spark.executor.heartbeatInterval=1000" \ +--conf "spark.scheduler.maxRegisteredResourcesWaitingTime=24h" \ +--conf "spark.scheduler.minRegisteredResourcesRatio=1.0" \ +./lib/kal-test_${scala_version_val}-0.1.jar \ +--task "write" \ +--pt ${pt} \ +--k ${k} \ +--testNum ${testNum} \ +--testBatchSize ${testBatchSize} \ +--dataPath ${data_path_val} \ +--groundTruthLocalPath ${groundTruthLocalPath} + +hadoop fs -mkdir -p ${groundTruthHDFSPath} +hadoop fs -rm -r ${groundTruthHDFSPath} +hadoop fs -put ${groundTruthLocalPath} ${groundTruthHDFSPath} + + +echo "--------------- 生成自研算法结果,并与真实解做对比 ---------------" +spark-submit \ +--class com.bigdata.compare.ml.KNNVerify \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--master yarn \ +--deploy-mode client \ +--num-executors ${numExe} \ +--executor-cores ${exeCore} \ +--executor-memory ${exeMem}g \ +--driver-cores 50 \ +--driver-memory 50g \ +--conf "spark.executor.extraJavaOptions=-Xms${exeMem}g" \ +--conf "spark.driver.maxResultSize=256G" \ +--conf "spark.scheduler.mode=FAIR" \ +--conf "spark.network.timeout=10000000" \ +--conf "spark.executor.heartbeatInterval=1000" \ +--conf 
"spark.scheduler.maxRegisteredResourcesWaitingTime=24h" \ +--conf "spark.scheduler.minRegisteredResourcesRatio=1.0" \ +./lib/kal-test_${scala_version_val}-0.1.jar \ +--task "verify" \ +--pt ${pt} \ +--k ${k} \ +--testNum ${testNum} \ +--testBatchSize ${testBatchSize} \ +--dataset_name ${dataset_name} \ +--dataPath ${data_path_val} \ +--groundTruthHDFSPath ${groundTruthHDFSPath} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/ml/major_compare.sh b/tools/kal-test/bin/compare/ml/major_compare.sh new file mode 100644 index 0000000..b79b0f9 --- /dev/null +++ b/tools/kal-test/bin/compare/ml/major_compare.sh @@ -0,0 +1,60 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression) + lda logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification) + cov pca pearson spca spearman lda ps svd dtb" + echo "2st argument: path of opt result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1]" + echo "3nd argument: path of raw result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1_raw]" + echo "Applicable to algorithm ALS KMeans LinR SVM GBDT.regression RF.regression XGBT.regression" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +alg=$1 +path0=$2 +path1=$3 + +if [ $alg == "als" ] || [ $alg == "kmeans" ]|| [ $alg == "linr" ]|| [ $alg == "svm" ]|| [ $alg == "dtr" ]|| [ $alg == "gbdtr" ]|| [ $alg == "rfr" ]|| [ $alg == "xgbtr" ]; then + class_name=com.bigdata.compare.ml.DownEvaluationVerify +elif [ $alg == "logr" ] || [ $alg == "svm" ] || [ $alg == "dtc" ] || [ $alg == "gbdtc" ] || [ $alg == "rfc" ] || [ $alg == "xgbtc" ] ; then + class_name=com.bigdata.compare.ml.UpEvaluationVerify +elif [ $alg == "cov" ] || [ $alg == "pca" ] || [ $alg == "pearson" ] || [ $alg == "spca" ] || [ $alg == "spearman" ]; then + class_name=com.bigdata.compare.ml.MatrixVerify +elif [ $alg == "lda" ]; then + class_name=com.bigdata.compare.ml.LDAVerify +elif [ $alg == "ps" ]; then + class_name=com.bigdata.compare.ml.PrefixSpanVerify +elif [ $alg == "svd" ] ; then + class_name=com.bigdata.compare.ml.SVDVerify +elif [ $alg == "dtb" ] ; then + class_name=com.bigdata.compare.ml.DTBVerify +else + alg_usage + exit 0 +fi + +source conf/ml/ml_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class ${class_name} \ +--master yarn \ +--num-executors 29 \ +--executor-memory 35g \ +--executor-cores 8 \ +--driver-memory 50g \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} \ No newline at end of file diff --git a/tools/kal-test/bin/compare/ml/simrank_compare.sh b/tools/kal-test/bin/compare/ml/simrank_compare.sh new file mode 100644 index 0000000..d1dba47 --- /dev/null +++ b/tools/kal-test/bin/compare/ml/simrank_compare.sh @@ -0,0 +1,39 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: path of SimRank opt result: eg [hdfs:///tmp/ml/result/SimRank/simrank3w]" + echo "2nd argument: path of SimRank raw result: eg [hdfs:///tmp/ml/result/SimRank/simrank3w]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +path0=$1 +path1=$2 + +source conf/ml/ml_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--driver-java-options "-Dlog4j.configuration=file:./log4j.properties" \ +--class com.bigdata.compare.ml.SimRankVerify \ +--master yarn \ +--deploy-mode client \ +--driver-cores 36 \ +--driver-memory 50g \ +--num-executors 71 \ +--executor-memory 12g \ +--executor-cores 4 \ +./lib/kal-test_${scala_version_val}-0.1.jar ${path0} ${path1} diff --git a/tools/kal-test/bin/graph/betweenness_run.sh b/tools/kal-test/bin/graph/betweenness_run.sh new file mode 100644 index 0000000..b5d1ad2 --- /dev/null +++ b/tools/kal-test/bin/graph/betweenness_run.sh @@ -0,0 +1,195 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents, enwiki_2018, uk_2002" + echo "2nd argument: optimization algorithm or raw: no/yes" + echo "3rd argument: verify result: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +source conf/graph/betweenness/betweenness_spark.properties + +dataset_name=$1 +is_raw=$2 +is_check=$3 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "enwiki_2018" ] && + [ ${dataset_name} != "uk_2002" ] ;then + echo "invalid dataset name,dataset name:cit_patents, enwiki_2018, uk_2002" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +num_partitions="${dataset_name}_numPartitions_${cpu_name}" +spark_task_cpus="${dataset_name}_SparkTaskCpus_${cpu_name}" +thread_num="${dataset_name}_ThreadNum_${cpu_name}" +betweenness_part_num="${dataset_name}_BetweennessPartNum_${cpu_name}" +pivots="${dataset_name}_pivots" +iteration="${dataset_name}_iteration" +graph_split="${dataset_name}_graphSplit" +deploy_mode="deployMode" +driver_memory="driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +driver_memory_val=${!driver_memory} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "${driver_memory}:${driver_memory_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${num_partitions_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +if [ ${is_raw} == "no" ]; then + spark_task_cpus_val=${!spark_task_cpus} + thread_num_val=${!thread_num} + betweenness_part_num_val=${!betweenness_part_num} + + echo "${spark_task_cpus}:${spark_task_cpus_val}" + echo "${thread_num}:${thread_num_val}" + echo "${betweenness_part_num}:${betweenness_part_num_val}" + + if [ ! ${spark_task_cpus_val} ] || + [ ! ${thread_num_val} ] || + [ ! 
${betweenness_part_num_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 + fi +else + pivots_val=${!pivots} + iteration_val=${!iteration} + graph_split_val=${!graph_split} + + echo "${pivots}:${pivots_val}" + echo "${iteration}:${iteration_val}" + echo "${graph_split}:${graph_split_val}" + + if [ ! ${pivots_val} ] || + [ ! ${iteration_val} ] || + [ ! ${graph_split_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 + fi +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +gt_path="${dataset_name}_gt" +data_path_val=${!dataset_name} +gt_path_val=${!gt_path} +output_path="${output_path_prefix}/betweenness/${is_raw}/${dataset_name}" +hdfs dfs -rm -r -f ${output_path} + +echo "${dataset_name} : ${data_path_val}" +echo "${gt_path} : ${gt_path_val}" +echo "output_path : ${output_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs--betweenness_${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.BetweennessRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.rpc.message.maxSize=2046 \ + --conf spark.worker.timeout=3600 \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.network.timeout=6000s \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.shuffle.manager=SORT \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.locality.wait.node=0 \ + --conf spark.boostkit.bigdata.graph.betweenness.partnum=${betweenness_part_num_val} \ + --conf spark.boostkit.bigdata.graph.betweenness.threadnum=${thread_num_val} \ + --conf spark.task.cpus=${spark_task_cpus_val} \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${is_raw} ${num_partitions_val} ${data_path_val} ${is_check} ${output_path} ${gt_path_val} | tee ./log/log +else + spark-submit \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --name 
"Betweenness_${dataset_name}_opensource" \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.kryoserializer.buffer.max=2047m \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.ui.showConsoleProgress=true \ + --conf spark.driver.extraJavaOptions="-Xms${driver_memory_val} -XX:hashCode=0" \ + --conf spark.executor.extraJavaOptions="-Xms${executor_memory_val} -XX:hashCode=0" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --jars "./lib/scopt_2.11-3.2.0.jar" \ + ./lib/hbse_2.11-0.1.jar \ + -m yarn \ + -s ${graph_split_val} \ + -n ${num_partitions_val} \ + -i ${data_path_val} \ + -o ${output_path} \ + -g ${gt_path_val} \ + -p ${pivots_val} \ + -b ${iteration_val} > betweenness_temp.log + CostTime=$(cat betweenness_temp.log |grep "CostTime of Top-K" | awk '{print $6}') + Accuracy=$(cat betweenness_temp.log |grep "Accuracy of Top-K" | awk '{print $6}') + currentTime=$(date "+%Y%m%d_H%M%S") + rm -rf betweenness_temp.log + echo -e "algorithmName: Betweenness\ncostTime: $CostTime\ndatasetName: ${dataset_name}\nisRaw: 'yes'\nAccuracy: ${Accuracy}\ntestcaseType: Betweenness_opensource_${dataset_name}\n" > ./report/"Betweenness_${currentTime}.yml" + echo "Exec Successful: end." > ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/bfs_run.sh b/tools/kal-test/bin/graph/bfs_run.sh new file mode 100644 index 0000000..4c43e98 --- /dev/null +++ b/tools/kal-test/bin/graph/bfs_run.sh @@ -0,0 +1,183 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage:" + echo "1st argument: name of dataset: cit_patents,enwiki_2018,arabic_2005,graph500_22,graph500_23,graph500_25" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +source conf/graph/bfs/bfs_spark.properties +source conf/graph/bfs/bfs_source_id.properties + +dataset_name=$1 +is_raw=$2 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "enwiki_2018" ] && + [ ${dataset_name} != "arabic_2005" ] && + [ ${dataset_name} != "graph500_22" ] && + [ ${dataset_name} != "graph500_23" ] && + [ ${dataset_name} != "graph500_25" ] ;then + echo "invalid dataset name,dataset name:cit_patents,enwiki_2018,arabic_2005,graph500_22,graph500_23,graph500_25" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +num_partitions="${dataset_name}_numPartitions_${cpu_name}" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +extra_java_options_val="-Xms${executor_memory_val}" + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "extra_java_options_val : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] || + [ ! 
${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! ${num_partitions_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +output_path="${output_path_prefix}/bfs/${is_raw}/${dataset_name}" + +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" + +source_ids="${dataset_name}_SourceID" +source_ids_val=${!source_ids} +source_ids_arr=($source_ids_val) +echo "${source_ids}:${source_ids_val}" + +if [ ${is_raw} != "no" ]; then + split="${dataset_name}_split" + q="${dataset_name}_q" + split_val=${!split} + q_val=${!q} + echo "${split}: ${split_val}" + echo "${q}: ${q_val}" +fi + +for source_id in ${source_ids_arr[@]} +do + hdfs dfs -rm -r -f "${output_path}_${source_id}" + + echo "start to clean cache and sleep 30s" + ssh server1 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" + sleep 30 + + echo "start to submit spark jobs -- bfs-${dataset_name}_${source_id}" + if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.BFSRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf "spark.driver.maxResultSize=200g" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.extraJavaOptions=-Xms200g" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${source_id} ${num_partitions_val} ${is_raw} ${data_path_val} "${output_path}_${source_id}" | tee ./log/log + else + spark-submit \ + --class Driver \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 80g \ + --conf "spark.driver.maxResultSize=80g" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.extraJavaOptions=-Xms80g" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/scopt_2.10-3.5.0.jar" \ + --driver-class-path "lib/scopt_2.10-3.5.0.jar" \ + --conf "spark.executor.extraClassPath=scopt_2.10-3.5.0.jar" \ + 
./lib/bfs_2.10-0.1.2.jar \ + -g 'EdgeList' \ + -p 'EdgePartition2D' \ + -n ${num_partitions_val} \ + -i ${source_id} \ + -d 10 \ + -l 'yarn' \ + -f "${data_path_val}" \ + -t "${split_val}" \ + -o "${output_path}_${source_id}" \ + -z ${num_partitions_val} \ + -q ${q_val} > bfs_temp_1.log + + cat bfs_temp_1.log | grep "BFS: Time:" > bfs_temp_2.log + if [ -s bfs_temp_2.log ];then + echo "start to report result" + else + exit 1 + fi + + while read line + do + echo $line + IFS=" " + line_arr=($line) + time_temp=${line_arr[${#line_arr[@]}-1]} + bfs_time=$(echo -e "scale=4;${time_temp}/1000" | bc) + echo "CostTime(s): ${bfs_time}" + if [ ! -d "./report" ]; then + mkdir report + fi + currentTime=$(date "+%Y%m%d_%H%M%S") + echo -e "algorithmName: BFS\ncostTime: $bfs_time\ndatasetName: ${dataset_name}\nisRaw: 'yes'\nsourceID: ${source_id}\ntestcaseType: BFS_opensource_${dataset_name}_${source_id}\n" > ./report/"BFS_${currentTime}.yml" + done < bfs_temp_2.log + rm -rf "bfs_temp_1.log" + rm -rf "bfs_temp_2.log" + fi +done diff --git a/tools/kal-test/bin/graph/cc_run.sh b/tools/kal-test/bin/graph/cc_run.sh new file mode 100644 index 0000000..204ffc6 --- /dev/null +++ b/tools/kal-test/bin/graph/cc_run.sh @@ -0,0 +1,130 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: graph500_25,graph500_26,liveJournal" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +if [ $dataset_name != 'graph500_25' ] && [ $dataset_name != 'graph500_26' ] && [ $dataset_name != 'liveJournal' ]; +then + echo 'invalid dataset' + echo "dataset name: graph500_25 or graph500_26 or liveJournal" + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/cc/cc_spark.properties +num_executors_val="numExecutors_${cpu_name}" +executor_cores_val="executorCores_${cpu_name}" +executor_memory_val="executorMemory_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${cpu_name}" +default_parallelism_val="defaultParallelism_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +default_parallelism=${!default_parallelism_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! 
${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" +echo "${default_parallelism_val}:${default_parallelism}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/cc/${is_raw}/${dataset_name}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- cc-${dataset_name}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.ConnectedComponentsRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --conf spark.default.parallelism=${default_parallelism} \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${is_raw} ${cpu_name} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.ConnectedComponentsRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.locality.wait.node=0 \ + --conf 
spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --driver-class-path "lib/kal-test_2.11-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${is_raw} ${cpu_name} | tee ./log/log +fi diff --git a/tools/kal-test/bin/graph/cd_run.sh b/tools/kal-test/bin/graph/cd_run.sh new file mode 100644 index 0000000..6e94cbc --- /dev/null +++ b/tools/kal-test/bin/graph/cd_run.sh @@ -0,0 +1,113 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage:" + echo "dataset name: simulate1,simulate2,usaRoad" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 1 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +if [ $dataset_name != 'simulate1' ] && [ $dataset_name != 'simulate2' ] && [ $dataset_name != 'usaRoad' ]; +then + echo 'invalid dataset' + echo "please input dataset name: simulate1 or simulate2 or usaRoad" + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/cd/cd_spark.properties +num_executors_val="numExecutors_${dataset_name}_${cpu_name}" +executor_cores_val="executorCores_${dataset_name}_${cpu_name}" +executor_memory_val="executorMemory_${dataset_name}_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! 
${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/cd/${dataset_name}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- cd-${dataset_name}" +spark-submit \ +--class com.bigdata.graph.CycleDetectionWithConstrainsRunner \ +--deploy-mode ${deploy_mode} \ +--driver-memory ${driver_memory} \ +--num-executors ${num_executors} \ +--executor-cores ${executor_cores} \ +--executor-memory ${executor_memory} \ +--conf spark.rpc.askTimeout=3600 \ +--conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ +--conf spark.worker.timeout=3600 \ +--conf spark.network.timeout=6000s \ +--conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ +--conf spark.shuffle.blockTransferService=nio \ +--conf spark.driver.maxResultSize=100g \ +--conf spark.shuffle.manager=SORT \ +--conf spark.broadcast.blockSize=25g \ +--conf spark.core.connection.ack.wait.timeout=60000s \ +--conf spark.shuffle.memoryFraction=0.6 \ +--conf spark.storage.memoryFraction=0.2 \ +--conf spark.rdd.compress=true \ +--conf spark.executor.memoryOverhead=5g \ +--conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" "no" ${cpu_name} | tee ./log/log \ No newline at end of file diff --git a/tools/kal-test/bin/graph/closeness_run.sh b/tools/kal-test/bin/graph/closeness_run.sh new file mode 100644 index 0000000..5e8a683 --- /dev/null +++ b/tools/kal-test/bin/graph/closeness_run.sh @@ -0,0 +1,129 @@ +#!/bin/bash +set -e + +function 
alg_usage() {
+ echo "Usage: "
+ echo "1st argument: name of dataset: cit_patents,uk_2002"
+ echo "2nd argument: weight or not: e.g. weighted,unweighted"
+ echo "3rd argument: verify result: no/yes"
+}
+
+case "$1" in
+-h | --help | ?)
+ alg_usage
+ exit 0
+ ;;
+esac
+
+if [ $# -ne 3 ];then
+ alg_usage
+ exit 0
+fi
+
+source conf/graph/closeness/closeness_spark.properties
+
+dataset_name=$1
+weight=$2
+is_check=$3
+
+if [ ${dataset_name} != "cit_patents" ] &&
+ [ ${dataset_name} != "uk_2002" ] ;then
+ echo "invalid dataset name,dataset name:cit_patents,uk_2002"
+ exit 1
+fi
+if [ ${weight} != "weighted" ] && [ ${weight} != "unweighted" ];then
+ echo "invalid argument value,must be: weighted or unweighted"
+ exit 1
+fi
+
+cpu_name=$(lscpu | grep Architecture | awk '{print $2}')
+
+# concatenate strings as a new variable
+num_executors="${dataset_name}_${weight}_numExecutors_${cpu_name}"
+executor_cores="${dataset_name}_${weight}_executorCores_${cpu_name}"
+executor_memory="${dataset_name}_${weight}_executorMemory_${cpu_name}"
+num_partitions="${dataset_name}_${weight}_numPartitions_${cpu_name}"
+ratio="${dataset_name}_${weight}_ratio_${cpu_name}"
+output_node_num="outputNodeNum"
+deploy_mode="deployMode"
+driver_memory="driverMemory"
+
+num_executors_val=${!num_executors}
+executor_cores_val=${!executor_cores}
+executor_memory_val=${!executor_memory}
+deploy_mode_val=${!deploy_mode}
+num_partitions_val=${!num_partitions}
+ratio_val=${!ratio}
+output_node_num_val=${!output_node_num}
+driver_memory_val=${!driver_memory}
+
+echo "${num_executors} : ${num_executors_val}"
+echo "${executor_cores}: ${executor_cores_val}"
+echo "${executor_memory} : ${executor_memory_val}"
+echo "${deploy_mode} : ${deploy_mode_val}"
+echo "${num_partitions} : ${num_partitions_val}"
+echo "${ratio} : ${ratio_val}"
+echo "${output_node_num} : ${output_node_num_val}"
+echo "${driver_memory}:${driver_memory_val}"
+
+if [ ! ${num_executors_val} ] ||
+ [ ! ${executor_cores_val} ] ||
+ [ ! ${executor_memory_val} ] ||
+ [ ! ${num_partitions_val} ] ||
+ [ ! ${ratio_val} ] ||
+ [ ! ${split} ] ||
+ [ !
${output_node_num_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +gt_path="closeness_gt_${dataset_name}" +data_path_val=${!dataset_name} +gt_path_val=${!gt_path} +output_path="${output_path_prefix}/closeness/${dataset_name}_${weight}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +echo "${gt_path} : ${gt_path_val}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- closeness-${dataset_name}" +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +spark-submit \ +--class com.bigdata.graph.ClosenessRunner \ +--master yarn \ +--deploy-mode ${deploy_mode_val} \ +--num-executors ${num_executors_val} \ +--executor-memory ${executor_memory_val} \ +--executor-cores ${executor_cores_val} \ +--driver-memory ${driver_memory_val} \ +--conf spark.worker.timeout=3600 \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.rpc.askTimeout=36000 \ +--conf spark.rdd.compress=true \ +--conf spark.network.timeout=6000s \ +--conf spark.broadcast.blockSize=4m \ +--conf spark.shuffle.manager=SORT \ +--conf spark.shuffle.blockTransferService=nio \ +--conf spark.locality.wait.node=0 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} ${weight} ${output_node_num_val} ${ratio_val} "no" ${data_path_val} ${output_path} ${split} ${gt_path_val} ${is_check} | tee ./log/log diff --git a/tools/kal-test/bin/graph/closeness_run_hive.sh b/tools/kal-test/bin/graph/closeness_run_hive.sh new file mode 100644 index 0000000..67ce8c3 --- /dev/null +++ b/tools/kal-test/bin/graph/closeness_run_hive.sh @@ -0,0 +1,71 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage:

" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +colWeight=$4 +weighted=$5 +k=$6 +p=$7 +partition=$8 +save_mode=$9 +save_arg=${10} + +echo "table_name: $table_name" +echo "col1: $col1" +echo "col2: $col2" +echo "colWeight: $colWeight" +echo "weighted: $weighted" +echo "k: $k" +echo "p: $p" +echo "partition: $partition" +echo "save_mode: $save_mode" +echo "save_arg: $save_arg" + +spark-submit \ +--class com.bigdata.graph.ClosenessHiveRunner \ +--master yarn \ +--deploy-mode "client" \ +--num-executors 35 \ +--executor-memory "25g" \ +--executor-cores 4 \ +--driver-memory "16g" \ +--conf spark.worker.timeout=3600 \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.rpc.askTimeout=36000 \ +--conf spark.network.timeout=6000s \ +--conf spark.broadcast.blockSize=4m \ +--conf spark.shuffle.manager=SORT \ +--conf spark.shuffle.blockTransferService=nio \ +--conf spark.locality.wait.node=0 \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${colWeight} ${weighted} ${k} ${p} ${partition} ${save_mode} ${save_arg} diff --git a/tools/kal-test/bin/graph/clusteringcoefficient_run.sh b/tools/kal-test/bin/graph/clusteringcoefficient_run.sh new file mode 100644 index 0000000..73c994e --- /dev/null +++ b/tools/kal-test/bin/graph/clusteringcoefficient_run.sh @@ -0,0 +1,157 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: name of dataset: cit_patents,uk_2002,arabic_2005,graph500_22,graph500_23,graph500_24,graph500_25" + echo "2nd argument: name of api: lcc,avgcc,globalcc" + echo "3nd argument: weight or not: weighted,unweighted" + echo "4th argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -ne 4 ];then + alg_usage + exit 0 +fi + +source conf/graph/clusteringcoefficient/clusteringcoefficient_spark.properties + +dataset_name=$1 +api_name=$2 +weight=$3 +is_raw=$4 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "uk_2002" ] && + [ ${dataset_name} != "arabic_2005" ] && + [ ${dataset_name} != "graph500_22" ] && + [ ${dataset_name} != "graph500_23" ] && + [ ${dataset_name} != "graph500_24" ] && + [ ${dataset_name} != "graph500_25" ] ;then + echo "invalid dataset name,dataset name:cit_patents,uk_2002,arabic_2005,graph500_22,graph500_23,graph500_24,graph500_25" + exit 1 +fi +if [ ${api_name} != "lcc" ] && + [ ${api_name} != "avgcc" ] && + [ ${api_name} != "globalcc" ] ;then + echo "invalid argument value,api name: lcc,avgcc,globalcc" + exit 1 +fi +if [ ${weight} != "weighted" ] && [ ${weight} != "unweighted" ];then + echo "invalid argument value,must be: weighted or unweighted" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +num_partitions="${dataset_name}_numPartitions_${cpu_name}" +deploy_mode="deployMode" +driver_memory="driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +driver_memory_val=${!driver_memory} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "${driver_memory}:${driver_memory_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${num_partitions_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/clusteringcoefficient/${is_raw}/${api_name}/${dataset_name}_${weight}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- clusteringcoefficient-${api_name}-${weight}-${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.ClusteringCoefficientRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.memoryOverhead=2048 \ + --conf spark.executor.extraJavaOptions="-Xms12g" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} ${weight} ${is_raw} ${data_path_val} ${api_name} ${output_path} | tee ./log/log +else + scp lib/lcc_kaiyuan.jar root@agent1:/opt/graph_classpath/ + scp lib/lcc_kaiyuan.jar root@agent2:/opt/graph_classpath/ + scp lib/lcc_kaiyuan.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.ClusteringCoefficientRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --name "clusteringcoefficient_${dataset_name}_${api_name}" \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf spark.network.timeout=6000s \ + --conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=25g \ + 
--conf spark.network.timeout=1200s \ + --conf spark.rpc.message.maxSize=2046 \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.executor.extraJavaOptions="-Xms35g" \ + --conf spark.rdd.compress=true \ + --jars "lib/lcc_kaiyuan.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/lcc_kaiyuan.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/lcc_kaiyuan.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} ${weight} ${is_raw} ${data_path_val} ${api_name} ${output_path} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/deepwalk_run.sh b/tools/kal-test/bin/graph/deepwalk_run.sh new file mode 100644 index 0000000..b2cf0a4 --- /dev/null +++ b/tools/kal-test/bin/graph/deepwalk_run.sh @@ -0,0 +1,144 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents_deepwalk" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +source conf/graph/deepwalk/deepwalk_spark.properties + +dataset_name=$1 +is_raw=$2 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +model_conf=${dataset_name}-${cpu_name} + +outputPath="/tmp/graph/result/deepwalk/${dataset_name}/${is_raw}" +hdfs dfs -rm -r -f ${outputPath} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- DeepWalk" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +spark-submit \ + --class com.bigdata.graph.DeepWalkRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 300g \ + --conf spark.kryoserializer.buffer.max=2047m \ + --conf spark.ui.showConsoleProgress=true \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.driver.extraJavaOptions="-Xms300G -XX:hashCode=0" \ + --conf spark.executor.extraJavaOptions="-Xms315G -XX:hashCode=0" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${outputPath} ${is_raw} | tee ./log/log + +else + + walkLength="walkLength_"${dataset_name}_${cpu_name} + numWalks="numWalks_"${dataset_name}_${cpu_name} + dimension="dimension_"${dataset_name}_${cpu_name} + partitions="partitions_"${dataset_name}_${cpu_name} + iteration="iteration_"${dataset_name}_${cpu_name} + windowSize="windowSize_"${dataset_name}_${cpu_name} + splitGraph="splitGraph_"${dataset_name}_${cpu_name} + + walkLength_val=${!walkLength} + numWalks_val=${!numWalks} + dimension_val=${!dimension} + partitions_val=${!partitions} + iteration_val=${!iteration} + windowSize_val=${!windowSize} + splitGraph_val=${!splitGraph} + +spark-submit \ + --class com.nrl.SparkedDeepWalkApp \ + --master yarn \ + --num-executors 6 \ + --executor-memory 95g \ + --driver-memory 300g \ + --executor-cores 38 \ + --driver-cores 80 \ + 
--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --conf spark.driver.extraJavaOptions="-Xms300g -XX:hashCode=0" \ + --conf spark.executor.extraJavaOptions="-Xms95g -XX:hashCode=0" \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + ./lib/sparked-deepwalk_2.11-1.0.jar "" ${data_path_val} "" "" "" "" ${outputPath} ${walkLength_val} ${numWalks_val} ${dimension_val} ${partitions_val} ${iteration_val} ${windowSize_val} ${splitGraph_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/degree_run.sh b/tools/kal-test/bin/graph/degree_run.sh new file mode 100644 index 0000000..cddeeac --- /dev/null +++ b/tools/kal-test/bin/graph/degree_run.sh @@ -0,0 +1,161 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: it_2004,twitter7,uk_2007_05,mycielskian20,gap_kron,com_friendster" + echo "2nd argument: name of api: degrees,inDegrees,outDegrees" + echo "3rd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +source conf/graph/degree/degree_spark.properties + +dataset_name=$1 +api_name=$2 +is_raw=$3 + +if [ ${dataset_name} != "it_2004" ] && + [ ${dataset_name} != "twitter7" ] && + [ ${dataset_name} != "uk_2007_05" ] && + [ ${dataset_name} != "mycielskian20" ] && + [ ${dataset_name} != "gap_kron" ] && + [ ${dataset_name} != "com_friendster" ] ;then + echo "invalid dataset name,dataset name:it_2004,twitter7,uk_2007_05,mycielskian20,gap_kron,com_friendster" + exit 1 +fi + +if [ ${api_name} != "degrees" ] && + [ ${api_name} != "inDegrees" ] && + [ ${api_name} != "outDegrees" ];then + echo "invalid api name,api name: degrees,inDegrees,outDegrees" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${api_name}_${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${api_name}_${dataset_name}_executorCores_${cpu_name}" +executor_memory="${api_name}_${dataset_name}_executorMemory_${cpu_name}" +num_partitions="${api_name}_${dataset_name}_numPartitions_${cpu_name}" +split="${dataset_name}_splitGraph" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +extra_java_options_val="-Xms${executor_memory_val}" + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "extra_java_options_val : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! 
${num_partitions_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/degree/${is_raw}/${dataset_name}_${api_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- degree-${api_name}-${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.DegreeRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf spark.locality.wait.node=0 \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.broadcast.blockSize=1m \ + --conf spark.reducer.maxSizeInFlight=59mb \ + --conf spark.shuffle.file.buffer=17k \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.io.compression.codec=lzf \ + --conf spark.shuffle.compress=true \ + --conf spark.rdd.compress=false \ + --conf spark.shuffle.io.preferDirectBufs=true \ + --conf spark.shuffle.spill.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar,lib/boostkit-graph-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${is_raw} ${data_path_val} 
${output_path} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.DegreeRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf spark.locality.wait.node=0 \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.broadcast.blockSize=1m \ + --conf spark.reducer.maxSizeInFlight=59mb \ + --conf spark.shuffle.file.buffer=17k \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.io.compression.codec=lzf \ + --conf spark.shuffle.compress=true \ + --conf spark.rdd.compress=false \ + --conf spark.shuffle.io.preferDirectBufs=true \ + --conf spark.shuffle.spill.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${is_raw} ${data_path_val} ${output_path} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/fraudar_run.sh b/tools/kal-test/bin/graph/fraudar_run.sh new file mode 100644 index 0000000..c855eec --- /dev/null +++ b/tools/kal-test/bin/graph/fraudar_run.sh @@ -0,0 +1,135 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: alpha,amazon,otc" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +if [ $dataset_name != 'alpha' ] && [ $dataset_name != 'amazon' ] && [ $dataset_name != 'otc' ]; +then + echo 'invalid dataset' + echo "dataset name: alpha or amazon or otc" + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name}-${is_raw} + +source conf/graph/fraudar/fraudar_spark.properties +num_executors_val="numExecutors_${dataset_name}_${cpu_name}" +executor_cores_val="executorCores_${dataset_name}_${cpu_name}" +executor_memory_val="executorMemory_${dataset_name}_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! 
${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} + +iset_out_path="/tmp/graph/result/fraudar/${is_raw}/${dataset_name}_i" +jset_out_path="/tmp/graph/result/fraudar/${is_raw}/${dataset_name}_j" +echo "${dataset_name}: ${input_path}" +echo ""outputPath:${iset_out_path},${jset_out_path}"" +echo "start to clean exist output" +hdfs dfs -rm -r -f ${iset_out_path} +hdfs dfs -rm -r -f ${jset_out_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- fraudar-${dataset_name}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.FraudarRunner \ + --master ${master} \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${input_path} ${iset_out_path} ${jset_out_path} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.FraudarRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + 
--conf spark.broadcast.blockSize=4m \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${iset_out_path} ${jset_out_path} | tee ./log/log +fi diff --git a/tools/kal-test/bin/graph/inccc_run.sh b/tools/kal-test/bin/graph/inccc_run.sh new file mode 100644 index 0000000..aae6d24 --- /dev/null +++ b/tools/kal-test/bin/graph/inccc_run.sh @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: graph500_26, com_Friendster, webbase_2001" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +source conf/graph/inccc/inccc_spark.properties + +# incremental data simulation: root paths of the incremental graph and the base graph +inc_data_root_path=/tmp/graph/incCC/data +# root path of the existing (base) CC results +orgcc_root_path=/tmp/graph/incCC/orgCC +rate=0.01 + +dataset_name=$1 +is_raw=$2 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name}-${is_raw} + +# concatenate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +orgccPath=${orgcc_root_path}/${dataset_name}_${rate}_single +incgraphPath=${inc_data_root_path}/${dataset_name}_${rate}/inc_${rate}_5 +outputPath="/tmp/graph/result/inccc/${dataset_name}/${is_raw}" +hdfs dfs -rm -r -f ${outputPath} + + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- IncConnectedComponents" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +spark-submit \ + --class com.bigdata.graph.IncConnectedComponentsRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.akka.timeout=3600 \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf spark.network.timeout=6000s \ + --conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.defalut.parallelism=280 \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.akka.frameSize=2046 \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${outputPath} ${orgccPath} ${incgraphPath} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/incpr_run.sh b/tools/kal-test/bin/graph/incpr_run.sh new file mode 100644 index 0000000..44f05b2 --- /dev/null +++ 
b/tools/kal-test/bin/graph/incpr_run.sh @@ -0,0 +1,136 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: twitter_2010" + echo "2nd argument: rate: e.g. 0.001,0.01,0.05" + echo "3rd argument: batch: e.g. 1,2,3,4,5" + echo "4th argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 4 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +rate=$2 +batch=$3 +is_raw=$4 + +source conf/graph/incpr/incpr_spark.properties +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatenate strings as a new variable +num_executors="numExectuors" +executor_cores="executorCores" +executor_memory="executorMemory" +extra_java_options="extraJavaOptions" +driver_cores="driverCores" +driver_memory="driverMemory" +executor_memory_overhead="execMemOverhead" +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name}_${rate}_batch_${batch} +output_path="${output_path_prefix}/incpr/${is_raw}/${dataset_name}_${rate}_batch_${batch}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- incpr-${dataset_name}_${rate}_batch_${batch}" +if [ ${is_raw} == "no" ]; then + spark-submit \ + --class com.bigdata.graph.IncPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer.max=2040m \ + --conf spark.driver.extraJavaOptions="-Xms100G" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.locality.wait.node=0 \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.memory.fraction=0.5 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path 
"lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${output_path} ${is_raw} | tee ./log/log +else + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.TrillionPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 80g \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer.max=2040m \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${output_path} no | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/katz_run.sh b/tools/kal-test/bin/graph/katz_run.sh new file mode 100644 index 0000000..7bbe4d6 --- /dev/null +++ b/tools/kal-test/bin/graph/katz_run.sh @@ -0,0 +1,116 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents, uk_2002, arabic_2005" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +source conf/graph/katz/katz_spark.properties + +dataset_name=$1 +is_raw=$2 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name}-${is_raw} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +outputPath="/tmp/graph/result/katz/${dataset_name}/${is_raw}" +hdfs dfs -rm -r -f ${outputPath} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- KatzCentrality" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +spark-submit \ + --class com.bigdata.graph.KatzCentralityRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.akka.frameSize=2046 \ + --conf spark.worker.timeout=3600 \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.akka.frameSize=3600 \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.locality.wait.node=0 \ + --conf spark.network.timeout=6000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.kryoserializer.buffer.max=2047m \ + --conf spark.defalut.parallelism=340 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${outputPath} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/kcore_run.sh b/tools/kal-test/bin/graph/kcore_run.sh new file mode 100644 index 0000000..fbdd90e --- /dev/null +++ b/tools/kal-test/bin/graph/kcore_run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: dataset name: graph500_22, graph500_23, graph500_25, graph500_26" + 
echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +if [ $dataset_name != 'graph500_22' ] && [ $dataset_name != 'graph500_23' ] && [ $dataset_name != 'graph500_25' ] && [ $dataset_name != 'graph500_26' ]; +then + echo 'invalid dataset' + echo 'dataset name: graph500_22 or graph500_23 or graph500_25 or graph500_26' + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/kcore/kcore_spark.properties +num_executors_val="numExecutors_${dataset_name}_${cpu_name}" +executor_cores_val="executorCores_${dataset_name}_${cpu_name}" +executor_memory_val="executorMemory_${dataset_name}_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${dataset_name}_${cpu_name}" +executor_emoryOverhead_val="executorMemoryOverhead_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +executor_emoryOverhead=${!executor_emoryOverhead_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! ${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" +echo "${executor_emoryOverhead_val}:${executor_emoryOverhead}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/kcore/${is_raw}/${dataset_name}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- kcore-${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.KCoreDecompositionRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --conf spark.driver.maxResultSize=200g \ + --conf 
spark.locality.wait.node=0 \ + --conf spark.executor.memoryOverhead=${executor_emoryOverhead} \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${is_raw} ${cpu_name} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.KCore \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.locality.wait.node=0 \ + --conf spark.executor.memoryOverhead=${executor_emoryOverhead} \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${is_raw} ${cpu_name} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/kcore_run_hive.sh b/tools/kal-test/bin/graph/kcore_run_hive.sh new file mode 100644 index 0000000..efc9572 --- /dev/null +++ b/tools/kal-test/bin/graph/kcore_run_hive.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage:

" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +partition=$4 +save_mode=$5 +save_arg=$6 + +echo "table_name: $table_name" +echo "col1: $col1" +echo "col2: $col2" +echo "partition: $partition" +echo "save_mode: $save_mode" +echo "save_arg: $save_arg" + +spark-submit \ +--class com.bigdata.graph.KCoreDecompositionHiveRunner \ +--deploy-mode "client" \ +--driver-memory "16g" \ +--num-executors 35 \ +--executor-cores 4 \ +--executor-memory "25g" \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.locality.wait.node=0 \ +--conf spark.executor.memoryOverhead=10240 \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "./lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${partition} ${save_mode} ${save_arg} diff --git a/tools/kal-test/bin/graph/louvain_run.sh b/tools/kal-test/bin/graph/louvain_run.sh new file mode 100644 index 0000000..2e3421e --- /dev/null +++ b/tools/kal-test/bin/graph/louvain_run.sh @@ -0,0 +1,149 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: graph500_22,graph500_24,graph500_25,cit_patents,uk_2002,arabic_2005" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +source conf/graph/louvain/louvain_spark.properties + +dataset_name=$1 +is_raw=$2 + +if [ ${dataset_name} != "graph500_22" ] && + [ ${dataset_name} != "graph500_24" ] && + [ ${dataset_name} != "graph500_25" ] && + [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "uk_2002" ] && + [ ${dataset_name} != "arabic_2005" ];then + echo "invalid dataset name, dataset name: graph500_22,graph500_24,graph500_25,cit_patents,uk_2002,arabic_2005" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatenate strings as a new variable +num_executors="${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +extra_java_options="${dataset_name}_extraJavaOptions_${cpu_name}" +num_partitions="${dataset_name}_numPartitions_${cpu_name}" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! ${num_partitions_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/louvain/${is_raw}/${dataset_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- louvain-${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.LouvainRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 16g \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf 
spark.driver.maxResultSize=200g \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.rdd.compress=true \ + --conf spark.network.timeout=6000s \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.shuffle.manager=SORT \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.locality.wait.node=0 \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} ${is_raw} ${data_path_val} ${output_path} +else + community_output=${output_path}/community + modularity_output=${output_path}/modularity + + spark-submit \ + --class com.huawei.graph.algorithms.open.LouvainByGraphx \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 16g \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.rdd.compress=true \ + --conf spark.network.timeout=6000s \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.shuffle.manager=SORT \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.locality.wait.node=0 \ + ./lib/louvain_2.11-0.1.0_open_sourced.jar yarn ${data_path_val} ${community_output} ${modularity_output} " " ${num_partitions_val} 2000 > louvain_temp.log + costTime=$(cat louvain_temp.log |grep "cost_time:" | awk '{print $2}') + modularity=$(cat louvain_temp.log |grep "modularity:" | awk '{print $2}') + currentTime=$(date "+%Y%m%d_%H%M%S") + rm -rf louvain_temp.log + echo -e "algorithmName: Louvain\ncostTime: $costTime\ndatasetName: ${dataset_name}\nisRaw: 'yes'\nmodularity: ${modularity}\ntestcaseType: Louvain_opensource_${dataset_name}\n" > ./report/"Louvain_${currentTime}.yml" +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/louvain_run_hive.sh b/tools/kal-test/bin/graph/louvain_run_hive.sh new file mode 100644 index 0000000..c651725 --- /dev/null +++ b/tools/kal-test/bin/graph/louvain_run_hive.sh @@ -0,0 +1,69 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage:
" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +colWeight=$4 +iterNum=$5 +isDirected=$6 +partition=$7 +save_mode=$8 +save_arg=$9 + +echo "table_name: $table_name" +echo "col1: $col1" +echo "colWeight: $colWeight" +echo "iterNum: $iterNum" +echo "isDirected: $isDirected" +echo "partition: $partition" +echo "save_mode: $save_mode" +echo "save_arg: $save_arg" + +spark-submit \ +--class com.bigdata.graph.LouvainHiveRunner \ +--master yarn \ +--deploy-mode "client" \ +--num-executors 35 \ +--executor-memory "25g" \ +--executor-cores 8 \ +--driver-memory "16g" \ +--conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ +--conf spark.worker.timeout=3600 \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.rpc.askTimeout=36000 \ +--conf spark.network.timeout=6000s \ +--conf spark.broadcast.blockSize=4m \ +--conf spark.shuffle.manager=SORT \ +--conf spark.shuffle.blockTransferService=nio \ +--conf spark.locality.wait.node=0 \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${colWeight} ${iterNum} ${isDirected} ${partition} ${save_mode} ${save_arg} \ No newline at end of file diff --git a/tools/kal-test/bin/graph/lpa_run.sh b/tools/kal-test/bin/graph/lpa_run.sh new file mode 100644 index 0000000..b6ec3be --- /dev/null +++ b/tools/kal-test/bin/graph/lpa_run.sh @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: graph500_22,graph500_24,graph500_25" + echo "2nd argument: api: run,runConvergence" + echo "3rd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +api=$2 +is_raw=$3 + +if [ $api != "run" ] && [ $api != "runConvergence" ]; +then + echo "invalid api." 
+ echo "api: run or runConvergence" + exit 0 +fi + +if [ $dataset_name != 'graph500_22' ] && [ $dataset_name != 'graph500_24' ] && [ $dataset_name != 'graph500_25' ]; +then + echo 'invalid dataset' + echo "dataset name: graph500_22 or graph500_24 or graph500_25" + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/lpa/lpa_spark.properties +num_executors_val="numExecutors_${dataset_name}_${cpu_name}" +executor_cores_val="executorCores_${dataset_name}_${cpu_name}" +executor_memory_val="executorMemory_${dataset_name}_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! ${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/lpa/${is_raw}/${dataset_name}/${api}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- lpa-${api}-${dataset_name}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.LabelPropagationRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${api} ${is_raw} ${cpu_name} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.LabelPropagationRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" ${is_raw} ${cpu_name} | tee ./log/log +fi diff --git a/tools/kal-test/bin/graph/mce_run.sh b/tools/kal-test/bin/graph/mce_run.sh new file mode 100644 index 0000000..9646980 --- /dev/null +++ b/tools/kal-test/bin/graph/mce_run.sh @@ -0,0 +1,102 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage:" + echo "dataset name: graph500_23, graph500_24, graph500_25" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 1 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/mce/mce_spark.properties +num_executors_val="numExecutors_${cpu_name}" +executor_cores_val="executorCores" +executor_memory_val="executorMemory_${cpu_name}" +extra_java_options_val="extraJavaOptions_${cpu_name}" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +extra_java_options=${!extra_java_options_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} + +echo "${deploy_mode_val}:${deploy_mode}" +echo "${driver_memory_val}:${driver_memory}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${extra_java_options_val}:${extra_java_options}" +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! ${driver_memory} ] \ + || [ ! ${extra_java_options} ] \ + || [ ! 
${deploy_mode} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/mce/${dataset_name}" +echo "${dataset_name} : ${input_path}" +echo "outputPath : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +echo "start to submit spark jobs -- mce-${dataset_name}" +spark-submit \ +--class com.bigdata.graph.MaximalCliqueEnumerationRunner \ +--deploy-mode ${deploy_mode} \ +--driver-memory ${driver_memory} \ +--num-executors ${num_executors} \ +--executor-cores ${executor_cores} \ +--executor-memory ${executor_memory} \ +--conf "spark.executor.extraJavaOptions=${extra_java_options}" \ +--conf spark.locality.wait=10 \ +--conf spark.rdd.compress=false \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/fastutil-8.3.1.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} | tee ./log/log + + + + + diff --git a/tools/kal-test/bin/graph/mce_run_hive.sh b/tools/kal-test/bin/graph/mce_run_hive.sh new file mode 100644 index 0000000..59cc0f6 --- /dev/null +++ b/tools/kal-test/bin/graph/mce_run_hive.sh @@ -0,0 +1,64 @@ +#!/bin/bash +set -e +case "$1" in +-h | --help | ?) + echo "Usage:
" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +minK=$4 +maxDegree=$5 +partition=$6 +save_mode=$7 +save_arg=$8 + +echo "table_name: $table_name" +echo "col1: $col1" +echo "col2: $col2" +echo "minK: $minK" +echo "maxDegree: $maxDegree" +echo "partition: $partition" +echo "save_mode: $save_mode" +echo "save_arg: $save_arg" + +spark-submit \ +--class com.bigdata.graph.MaximalCliqueEnumerationHiveRunner \ +--master yarn \ +--deploy-mode "client" \ +--driver-memory "80g" \ +--num-executors 59 \ +--executor-cores 4 \ +--executor-memory "15g" \ +--conf spark.locality.wait=10 \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${minK} ${maxDegree} ${partition} ${save_mode} ${save_arg} + + + + + diff --git a/tools/kal-test/bin/graph/modularity_run.sh b/tools/kal-test/bin/graph/modularity_run.sh new file mode 100644 index 0000000..168e1ee --- /dev/null +++ b/tools/kal-test/bin/graph/modularity_run.sh @@ -0,0 +1,120 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "please input 1 argument: " + echo "1st argument: name of dataset: graph500_23, graph500_25, graph500_26, uk_2002, arabic_2005, twitter" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +source conf/graph/modularity/modularity_spark.properties +# concatnate strings as a new variable +num_executors="${dataset_name}_numExectuors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +extra_java_options="${dataset_name}_extraJavaOptions_${cpu_name}" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +input_community="${dataset_name}_community" +input_community_val=${!input_community} +echo "${dataset_name} : ${data_path_val}" +echo "input_community : ${input_community_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- modularity-${dataset_name}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.ModularityRunner \ + --driver-memory 80g \ + --master yarn \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${input_community_val} | tee ./log/log +else + spark-submit \ + --class com.huawei.graph.algorithms.ModularityComputeByNovel \ + --master yarn \ + --deploy-mode client \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 200g \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.rpc.message.maxSize=2046 \ + --conf spark.worker.timeout=3600 \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=4m \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=48m \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + ./lib/modularity_2.11-0.1.0_open.jar yarn ${data_path_val} ${input_community_val} " " "," 500 false false > modularity_temp.log + costTime=$(cat modularity_temp.log |grep "cost_time:" | awk '{print $2}') + modularity=$(cat 
modularity_temp.log |grep "modularity:" | awk '{print $2}') + currentTime=$(date "+%Y%m%d_H%M%S") + rm -rf modularity_temp.log + echo -e "algorithmName: Modularity\ncostTime: $costTime\ndatasetName: ${dataset_name}\nisRaw: 'yes'\nModularity: ${modularity}\ntestcaseType: Modularity_opensource_${dataset_name}\n" > ./report/"Modularity_${currentTime}.yml" +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/mssp_run.sh b/tools/kal-test/bin/graph/mssp_run.sh new file mode 100644 index 0000000..92dd056 --- /dev/null +++ b/tools/kal-test/bin/graph/mssp_run.sh @@ -0,0 +1,114 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: soc_liveJournal,uk_2002,arabic_2005" + echo "2nd argument: source number: 5/50" + echo "3rd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +source_num=$2 +is_raw=$3 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +source conf/graph/mssp/mssp_spark.properties +num_executors_val=${numExectuors} +executor_cores_val=${executorCores} +executor_memory_val=${executorMemory} +driver_memory_val=${driverMemory} +extra_java_options_val=${extraJavaOptions} +compute_partition_val=${computePartition} +split=${splitGraph} + +echo "numExectuors : ${num_executors_val}" +echo "executorCores: ${executor_cores_val}" +echo "executorMemory : ${executor_memory_val}" +echo "driverMemory : ${driver_memory_val}" +echo "extraJavaOptions : ${extra_java_options_val}" +echo "computePartition : ${compute_partition_val}" +echo "splitGraph : ${split}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${driver_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! ${compute_partition_val} ] || + [ ! ${executor_memory_val} ] || + [ ! 
${split} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +source_path=${dataset_name}_${source_num} +source_path_val=${!source_path} +output_path="${output_path_prefix}/mssp/${is_raw}/${dataset_name}_${source_num}" +hdfs dfs -rm -r -f ${output_path} + +echo "${dataset_name} : ${data_path_val}" +echo "${source_path} : ${source_path_val}" +echo "outputPath : ${output_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- mssp-${dataset_name0}-${source_num}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.MSSPRunner \ + --master yarn \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.driver.maxResultSize=100g \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name}_${source_num} ${compute_partition_val} ${data_path_val} ${output_path} ${source_path_val} ${split} ${is_raw} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.MSSPRunner \ + --master yarn \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.driver.maxResultSize=100g \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name}_${source_num} ${compute_partition_val} ${data_path_val} ${output_path} ${source_path_val} ${split} ${is_raw} | tee ./log/log +fi diff --git a/tools/kal-test/bin/graph/node2vec_run.sh b/tools/kal-test/bin/graph/node2vec_run.sh new file mode 100644 index 0000000..eeaf4cd --- /dev/null +++ b/tools/kal-test/bin/graph/node2vec_run.sh @@ -0,0 +1,161 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents,soc_liveJournal,uk_2002" + echo "2nd argument: optimization algorithm or raw: 
no/yes" + echo "3rd argument: verify result: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +source conf/graph/node2vec/node2vec_spark.properties + +dataset_name=$1 +is_raw=$2 +is_check=$3 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "soc_liveJournal" ] && + [ ${dataset_name} != "uk_2002" ] ;then + echo "invalid dataset name,dataset name:cit_patents,soc_liveJournal,uk_2002" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name} + +# concatnate strings as a new variable +master_="master" +deploy_mode="deployMode" +driver_memory="driverMemory" +driver_cores="driverCores_${cpu_name}" +executor_cores="executorCores_${cpu_name}" +executor_memory="executorMemory_${cpu_name}" +num_executors="numExecutors_${cpu_name}" + +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +driver_memory_val=${!driver_memory} +driver_cores_val=${!driver_cores} + +echo "${cpu_name}" +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${driver_memory}:${driver_memory_val}" +echo "${driver_cores}:${driver_cores_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${driver_memory_val} ] || + [ ! ${driver_cores_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +gt_path="${dataset_name}_negEdge" +data_path_val=${!dataset_name} +gt_path_val=${!gt_path} +output_path="${output_path_prefix}/node2vec/${is_raw}/${dataset_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +echo "gt_path : ${gt_path_val}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs--node2vec_${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.Node2VecRunner \ + --master ${master_val} \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + 
--executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --driver-cores ${driver_cores_val} \ + --conf spark.kryoserializer.buffer.max=2047m \ + --conf spark.ui.showConsoleProgress=true \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.driver.extraJavaOptions="-Xms300G -XX:hashCode=0" \ + --conf spark.executor.extraJavaOption="-Xms315G -XX:hashCode=0" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --jars "lib/smile-core-2.5.3.jar,lib/smile-math-2.5.3.jar,lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar,lib/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${output_path} ${gt_path_val} ${is_check} | tee ./log/log +else + spark-submit \ + --class vn.five9.Main \ + --master ${master_val} \ + --name "Node2Vec_${model_conf}" \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --driver-cores ${driver_cores_val} \ + --conf spark.kryoserializer.buffer.max=2047m \ + --conf spark.ui.showConsoleProgress=true \ + --conf spark.driver.maxResultSize=0 \ + --conf spark.driver.extraJavaOptions="-Xms300G -XX:hashCode=0" \ + --conf spark.executor.extraJavaOption="-Xms315G -XX:hashCode=0" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --jars "lib/fastutil-8.3.1.jar,lib/spark-mllib_2.11-2.3.2.jar,lib/smile-core-2.5.3.jar,lib/scopt_2.11-3.5.0.jar,lib/smile-math-2.5.3.jar" \ + ./lib/node2vec-baseline.jar \ + --cmd node2vec --indexed true --directed true --degree 1000000000 \ + --p 1.0 --q 1.0 --walkLength 5 --numWalks 10 \ + --input ${data_path_val} --output ${output_path} > node2vec_tmp.log + + CostTime=$(cat node2vec_tmp.log |grep "total time" | awk '{print $7}') + currentTime=$(date "+%Y%m%d_H%M%S") + rm -rf node2vec_tmp.log + echo -e "algorithmName: Node2vec\ncostTime: $CostTime\ndatasetName: ${dataset_name}\nisRaw: 'yes'\ntestcaseType: Node2vec_opensource_${dataset_name}\n" > ./report/"Node2vec_${currentTime}.yml" + if [ $? -eq 0 ];then + echo "Exec Successful: end." 
> ./log/log + else + echo "Exec Failure: please check the code" > ./log/log + fi +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/ppr_run.sh b/tools/kal-test/bin/graph/ppr_run.sh new file mode 100644 index 0000000..09649a0 --- /dev/null +++ b/tools/kal-test/bin/graph/ppr_run.sh @@ -0,0 +1,206 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents,uk_2002,arabic_2005" + echo "2nd argument: name of api: fixMS,fixSS,conSS" + echo "3rd argument: optimization algorithm or raw: no/yes" + echo "4th argument: sourceCnt or null: 1,5,10,50,100" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ] && [ $# -ne 4 ];then + alg_usage + exit 0 +fi + +source conf/graph/ppr/ppr_spark.properties + +dataset_name=$1 +api_name=$2 +is_raw=$3 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "uk_2002" ] && + [ ${dataset_name} != "arabic_2005" ] ;then + echo "invalid dataset name,dataset name:cit_patents,uk_2002,arabic_2005" + exit 1 +fi +if [ ${api_name} != "fixMS" ] && + [ ${api_name} != "fixSS" ] && + [ ${api_name} != "conSS" ] ;then + echo "invalid argument value,api name: fixMS,fixSS,conSS" + exit 1 +fi + +if [ $# -eq 4 ]; then + src=$4 + if [ ${src} != "1" ] && + [ ${src} != "5" ] && + [ ${src} != "10" ] && + [ ${src} != "50" ] && + [ ${src} != "100" ] ;then + echo "invalid argument value,must be: 1, 5, 10, 50 or 100" + exit 1 + fi +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${api_name}_${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${api_name}_${dataset_name}_executorCores_${cpu_name}" +executor_memory="${api_name}_${dataset_name}_executorMemory_${cpu_name}" +num_partitions="${api_name}_${dataset_name}_numPartitions_${cpu_name}" +extra_Java_Options="${api_name}_${dataset_name}_extraJavaOptions_${cpu_name}" +deploy_mode="deployMode" +driver_memory="driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +driver_memory_val=${!driver_memory} +extra_Java_Options_val=${!extra_Java_Options} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "${driver_memory}:${driver_memory_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_Java_Options_val} ] || + [ ! 
${num_partitions_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +function clean_cache() { + echo "start to clean cache and sleep 30s" + ssh server1 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" + ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" + sleep 30 +} + +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ +fi + +if [ ${api_name} == "fixMS" ]; then + output_path="${output_path_prefix}/ppr/${is_raw}/${dataset_name}/${api_name}_${src}" + hdfs dfs -rm -r -f ${output_path} + clean_cache + echo "start to submit spark jobs -- ppr-${api_name}_${dataset_name}_${src}" + if [ ${is_raw} == "no" ]; then + spark-submit \ + --class com.bigdata.graph.PersonalizedPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.driver.extraJavaOptions="-Xms80G" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${data_path_val} ${is_raw} ${src} ${output_path} | tee ./log/log + else + spark-submit \ + --class com.bigdata.graph.PersonalizedPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.driver.extraJavaOptions="-Xms80G" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/fastutil-8.3.1.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${data_path_val} ${is_raw} ${src} ${output_path} | tee ./log/log + fi +else + source conf/graph/ppr/ppr_source_id.properties + IFS=, + 
source_ids="${dataset_name}_SourceID" + source_ids_val=${!source_ids} + source_ids_arr=($source_ids_val) + + echo "${source_ids}:${source_ids_val}" + for source_id in ${source_ids_arr[@]} + do + output_path="${output_path_prefix}/ppr/${is_raw}/${dataset_name}/${api_name}_${source_id}" + hadoop fs -rm -r -f ${output_path} + clean_cache + echo "start to submit spark jobs -- ppr-${api_name}_${dataset_name}_${source_id}" + if [ ${is_raw} == "no" ]; then + spark-submit \ + --class com.bigdata.graph.PersonalizedPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.driver.extraJavaOptions="-Xms80G" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${data_path_val} ${is_raw} ${source_id} ${output_path} | tee ./log/log + else + spark-submit \ + --class com.bigdata.graph.PersonalizedPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.driver.extraJavaOptions="-Xms80G" \ + --conf spark.locality.wait.node=0 \ + --jars "lib/fastutil-8.3.1.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${data_path_val} ${is_raw} ${source_id} ${output_path} | tee ./log/log + fi + done +fi diff --git a/tools/kal-test/bin/graph/pr_run.sh b/tools/kal-test/bin/graph/pr_run.sh new file mode 100644 index 0000000..7261819 --- /dev/null +++ b/tools/kal-test/bin/graph/pr_run.sh @@ -0,0 +1,134 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents,uk_2002,arabic_2005" + echo "2nd argument: name of api: run,runUntilConvergence" + echo "3rd argument: optimization algorithm or raw: no/yes" +} +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + usage + exit 0 +fi + +source conf/graph/pr/pr_spark.properties + +dataset_name=$1 +api_name=$2 +is_raw=$3 + +if [ ${dataset_name} != "cit_patents" ] && [ ${dataset_name} != "uk_2002" ] && [ ${dataset_name} != "arabic_2005" ];then + echo "invalid dataset name,dataset name:cit_patents,or uk_2002,or arabic_2005" + exit 1 +fi +if [ ${api_name} != "run" ] && [ ${api_name} != "runUntilConvergence" ];then + echo "invalid api name,api name: run or runUntilConvergence" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +prefix="run" +if [ ${api_name} == "runUntilConvergence" ] +then + prefix="convergence" +fi + +# concatnate strings as a new variable +num_executors="${prefix}_${dataset_name}_numExecutors_${cpu_name}" +executor_cores="${prefix}_${dataset_name}_executorCores_${cpu_name}" +executor_memory="${prefix}_${dataset_name}_executorMemory_${cpu_name}" +extra_java_options="${prefix}_${dataset_name}_extraJavaOptions_${cpu_name}" +num_partitions="${prefix}_${dataset_name}_numPartitions_${cpu_name}" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! 
${num_partitions_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/pr/${is_raw}/${dataset_name}_${api_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- pr-${dataset_name}-${api_name}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.PageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.driver.extraJavaOptions="-Xms100G" \ + --conf spark.locality.wait.node=0 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${is_raw} ${data_path_val} ${output_path} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.PageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.driver.extraJavaOptions="-Xms100G" \ + --conf spark.locality.wait.node=0 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${api_name} ${num_partitions_val} ${is_raw} ${data_path_val} ${output_path} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/pr_run_hive.sh b/tools/kal-test/bin/graph/pr_run_hive.sh new file mode 100644 index 0000000..81f4bb3 --- /dev/null +++ b/tools/kal-test/bin/graph/pr_run_hive.sh @@ -0,0 +1,66 @@ +#!/bin/bash +set -e + +case 
"$1" in +-h | --help | ?) + echo "Usage:
" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +api=$4 +tol=$5 +resetProb=$6 +numIter=$7 +partition=$8 +save_mode=$9 +save_arg=${10} + +echo "table_name: $table_name" +echo "col1: $col1" +echo "col2: $col2" +echo "api: $api" +echo "tol: $tol" +echo "resetProb: $resetProb" +echo "numIter: $numIter" +echo "partition: $partition" +echo "save_mode: $save_mode" +echo "save_arg: $save_arg" + +spark-submit \ +--class com.bigdata.graph.PageRankHiveRunner \ +--master yarn \ +--deploy-mode "client" \ +--num-executors 36 \ +--executor-memory "25g" \ +--executor-cores 4 \ +--driver-memory 100g \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.driver.extraJavaOptions="-Xms100G" \ +--conf spark.locality.wait.node=0 \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${api} ${tol} ${resetProb} ${numIter} ${partition} ${save_mode} ${save_arg} diff --git a/tools/kal-test/bin/graph/scc_run.sh b/tools/kal-test/bin/graph/scc_run.sh new file mode 100644 index 0000000..5c59caa --- /dev/null +++ b/tools/kal-test/bin/graph/scc_run.sh @@ -0,0 +1,149 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents,enwiki_2018,arabic_2005" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +num_executors_val="numExecutors_${cpu_name}" +executor_cores_val="executorCores_${cpu_name}" +executor_memory_val="executorMemory_${cpu_name}" +if [ ${dataset_name} == "arabic_2005" ] +then + num_executors_val="numExecutors_${cpu_name}_arabic_2005" + executor_cores_val="executorCores_${cpu_name}_arabic_2005" + executor_memory_val="executorMemory_${cpu_name}_arabic_2005" +fi +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +source conf/graph/scc/scc_spark.properties +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! 
${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "executor_extra_javaopts:${executor_extra_javaopts}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/scc/${is_raw}/${dataset_name}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- scc-${dataset_name}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.StronglyConnectedComponentsRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.ui.showConsoleProgress=false \ + --conf spark.driver.extraJavaOptions="-Xms12g -XX:hashCode=0" \ + --conf spark.executor.extraJavaOptions="${executor_extra_javaopts}" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.memory.fraction=0.24939583270092516 \ + --conf spark.memory.storageFraction=0.5849745294783253 \ + --conf spark.broadcast.blockSize=1m \ + --conf spark.reducer.maxSizeInFlight=59mb \ + --conf spark.shuffle.file.buffer=17k \ + --conf spark.io.compression.codec=lzf \ + --conf spark.shuffle.compress=true \ + --conf spark.rdd.compress=false \ + --conf spark.shuffle.io.preferDirectBufs=true \ + --conf spark.shuffle.spill.compress=true \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" ${is_raw} ${cpu_name} 300 | tee ./log/log +else + spark-submit \ + --class 
com.bigdata.graph.StronglyConnectedComponentsRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.ui.showConsoleProgress=false \ + --conf spark.driver.extraJavaOptions="-Xms12g -XX:hashCode=0" \ + --conf spark.executor.extraJavaOptions="${executor_extra_javaopts}" \ + --conf spark.rpc.askTimeout=1000000s \ + --conf spark.network.timeout=1000000s \ + --conf spark.executor.heartbeatInterval=100000s \ + --conf spark.rpc.message.maxSize=1000 \ + --conf spark.memory.fraction=0.24939583270092516 \ + --conf spark.memory.storageFraction=0.5849745294783253 \ + --conf spark.broadcast.blockSize=1m \ + --conf spark.reducer.maxSizeInFlight=59mb \ + --conf spark.shuffle.file.buffer=17k \ + --conf spark.io.compression.codec=lzf \ + --conf spark.shuffle.compress=true \ + --conf spark.rdd.compress=false \ + --conf spark.shuffle.io.preferDirectBufs=true \ + --conf spark.shuffle.spill.compress=true \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} "run" ${is_raw} ${cpu_name} 400 | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/sgm_run.sh b/tools/kal-test/bin/graph/sgm_run.sh new file mode 100644 index 0000000..d2f5c36 --- /dev/null +++ b/tools/kal-test/bin/graph/sgm_run.sh @@ -0,0 +1,237 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: name of dataset: graph500_19,liveJournal,com_orkut" + echo "2nd argument: name of queryGraph: for Identical: 4dgn/4sqr/5tree/6star; for unIdentical: 4dgn/4clique/5clique/6clique" + echo "3rd argument: match mode:Identical,unIdentical" + echo "4th argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -ne 4 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +queryGraph=$2 +match_mode=$3 +is_raw=$4 + +if [ ${dataset_name} != "graph500_19" ] && + [ ${dataset_name} != "liveJournal" ] && + [ ${dataset_name} != "com_orkut" ] ;then + echo "invalid dataset name,dataset name:graph500_19,liveJournal,com_orkut" + exit 1 +fi +if [ ${match_mode} != "Identical" ] && + [ ${match_mode} != "unIdentical" ] ;then + echo "invalid argument value,match mode:identical or unidentical" + exit 1 +fi +if [ ${match_mode} == "Identical" ] ; then + if [ ${queryGraph} != "4dgn" ] && + [ ${queryGraph} != "4sqr" ] && + [ ${queryGraph} != "5tree" ] && + [ ${queryGraph} != "6star" ] ; then + echo "invalid queryGraph,queryGraph name:4dgn,4sqr,5tree,6star" + exit 1 + fi +elif [ ${match_mode} == "unIdentical" ]; then + if [ ${queryGraph} != "4dgn" ] && + [ ${queryGraph} != "4clique" ] && + [ ${queryGraph} != "5clique" ] && + [ ${queryGraph} != "6clique" ] ; then + echo "invalid queryGraph,queryGraph name:4dgn,4clique,5clique,6clique" + exit 1 + fi +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +source conf/graph/sgm/sgm_spark.properties +num_executors="${dataset_name}_${queryGraph}_${match_mode}_numExecutors_${cpu_name}" +executor_cores="${dataset_name}_${queryGraph}_${match_mode}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_${queryGraph}_${match_mode}_executorMemory_${cpu_name}" +num_partitions="${dataset_name}_${queryGraph}_${match_mode}_numPartitions_${cpu_name}" +extra_Java_Options="${dataset_name}_${queryGraph}_${match_mode}_executorExtraJavaOptions_${cpu_name}" +num_Task="${dataset_name}_${queryGraph}_${match_mode}_numberTask_${cpu_name}" +deploy_mode="deployMode" +driver_memory="driverMemory" +rpc_askTime="rpcAskTime" +scheduler_maxRegisteredResourcesWaitingTime="schedulerMaxRegisteredResourcesWaitingTime" +worker_timeout="workerTimeout" +network_timeout="networkTimeout" +storage_blockManagerSlaveTimeoutMs="storageBlockManagerSlaveTimeoutMs" +shuffle_blockTransferService="shuffleBlockTransferService" +driver_maxResultSize="driverMaxResultSize" +shuffle_manager="shuffleManager" +broadcast_blockSize="broadcastBlockSize" +rpc_message_maxSize="rpcMessageMaxSize" +core_connection_ack_wait_timeout="coreConnectionAckWaitTimeout" +storage_memoryFraction="storageMemoryFraction" +shuffle_memoryFraction="shuffleMemoryFraction" +rdd_compress="rddCompress" +memory_useLegacyMode="memoryUseLegacyMode" +num_Colors="numberColors" +graph_Split="${dataset_name}_split" + +if [ ${is_raw} == "yes" ]; then + num_executors="numExecutors" + executor_cores="executorCores" + executor_memory="executorMemory" + num_partitions="numPartitions" + extra_Java_Options="executorExtraJavaOptions" +fi + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +num_task_val=${!num_Task} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +extra_Java_Options_val=${!extra_Java_Options} +driver_memory_val=${!driver_memory} +rpc_askTime_val=${!rpc_askTime} +scheduler_maxRegisteredResourcesWaitingTime_val=${!scheduler_maxRegisteredResourcesWaitingTime} +worker_timeout_val=${!worker_timeout} +network_timeout_val=${!network_timeout} +storage_blockManagerSlaveTimeoutMs_val=${!storage_blockManagerSlaveTimeoutMs} +shuffle_blockTransferService_val=${!shuffle_blockTransferService} +driver_maxResultSize_val=${!driver_maxResultSize} +shuffle_manager_val=${!shuffle_manager} 
+broadcast_blockSize_val=${!broadcast_blockSize} +rpc_message_maxSize_val=${!rpc_message_maxSize} +core_connection_ack_wait_timeout_val=${!core_connection_ack_wait_timeout} +storage_memoryFraction_val=${!storage_memoryFraction} +shuffle_memoryFraction_val=${!shuffle_memoryFraction} +rdd_compress_val=${!rdd_compress} +memory_useLegacyMode_val=${!memory_useLegacyMode} +num_colors_val=${!num_Colors} +graph_split_val=${!graph_Split} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "${driver_memory}:${driver_memory_val}" +echo "${extra_Java_Options}:${extra_Java_Options_val}" +echo "${num_Task}:${num_task_val}" +echo "${num_Colors}:${num_colors_val}" +echo "${graph_Split}:${graph_split_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${num_partitions_val} ] || + [ ! ${num_task_val} ] || + [ ! ${num_colors_val} ] || + [ ! ${graph_split_val} ] || + [ ! ${extra_Java_Options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +queryGraph_path="query_${queryGraph}" +queryGraph_path_val=${!queryGraph_path} +echo "${dataset_name} : ${data_path_val}" +echo "${queryGraph_path} : ${queryGraph_path_val}" + +output_path="${output_path_prefix}/sgm/${is_raw}/${dataset_name}_${queryGraph}_${match_mode}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs--SGM_${dataset_name}_${queryGraph}_${match_mode}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.SubgraphMatchingRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.rpc.askTime=${rpc_askTime_val} \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=${scheduler_maxRegisteredResourcesWaitingTime_val} \ + --conf spark.worker.timeout=${worker_timeout_val} \ + --conf spark.network.timeout=${network_timeout_val} \ + --conf spark.storage.blockManagerSlaveTimeoutMs=${storage_blockManagerSlaveTimeoutMs_val} \ + --conf spark.shuffle.blockTransferService=${shuffle_blockTransferService_val} \ + --conf 
spark.driver.maxResultSize=${driver_maxResultSize_val} \ + --conf spark.shuffle.manager=${shuffle_manager_val} \ + --conf spark.broadcast.blockSize=${broadcast_blockSize_val} \ + --conf spark.rpc.message.maxSize=${rpc_message_maxSize_val} \ + --conf spark.core.connection.ack.wait.timeout=${core_connection_ack_wait_timeout_val} \ + --conf spark.storage.memoryFraction=${storage_memoryFraction_val} \ + --conf spark.shuffle.memoryFraction=${shuffle_memoryFraction_val} \ + --conf spark.rdd.compress=${rdd_compress_val} \ + --conf spark.memory.useLegacyMode=${memory_useLegacyMode_val} \ + --conf spark.executor.memoryOverhead=5g \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${queryGraph} ${is_raw} ${match_mode} ${output_path} ${data_path_val} ${num_partitions_val} ${num_task_val} ${queryGraph_path_val} | tee ./log/log +else + spark-submit \ + --class pegasus.spark.subgraph.TestOriginal \ + --name "SGM_${dataset_name}_${queryGraph}_opensource" \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory ${driver_memory_val} \ + --conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ + --conf spark.rpc.askTime=${rpc_askTime_val} \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=${scheduler_maxRegisteredResourcesWaitingTime_val} \ + --conf spark.worker.timeout=${worker_timeout_val} \ + --conf spark.network.timeout=${network_timeout_val} \ + --conf spark.storage.blockManagerSlaveTimeoutMs=${storage_blockManagerSlaveTimeoutMs_val} \ + --conf spark.shuffle.blockTransferService=${shuffle_blockTransferService_val} \ + --conf spark.driver.maxResultSize=${driver_maxResultSize_val} \ + --conf spark.shuffle.manager=${shuffle_manager_val} \ + --conf spark.broadcast.blockSize=${broadcast_blockSize_val} \ + --conf spark.rpc.message.maxSize=${rpc_message_maxSize_val} \ + --conf spark.core.connection.ack.wait.timeout=${core_connection_ack_wait_timeout_val} \ + --conf spark.storage.memoryFraction=${storage_memoryFraction_val} \ + --conf spark.shuffle.memoryFraction=${shuffle_memoryFraction_val} \ + --conf spark.rdd.compress=${rdd_compress_val} \ + --conf spark.memory.useLegacyMode=${memory_useLegacyMode_val} \ + ./lib/pegasus-spark_2.11-0.1.0-SNAPSHOT_openSource.jar yarn ${data_path_val} ${output_path} ${queryGraph_path_val} ${num_colors_val} 232 "," ${graph_split_val} 10000 > sgm_temp.log + num_subgraphs=$(cat sgm_temp.log | grep "number of matched subgraphs" | awk -F '[\t]' '{print $2}') + costTime=$(cat sgm_temp.log | grep "cost time" | awk -F '[\t]' '{print $2}') + currentTime=$(date "+%Y%m%d_H%M%S") + rm -rf sgm_temp.log + echo -e "algorithmName: SGM\ncostTime: $costTime\ndatasetName: ${dataset_name}\nisRaw: 'yes'\nnum_subgraphs: $num_subgraphs\ntestcaseType: SGM_opensource_${1}_${2}_opensource\n" > ./report/"SGM_${currentTime}.yml" + echo "Exec Successful: End." 
> ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/tc_run.sh b/tools/kal-test/bin/graph/tc_run.sh new file mode 100644 index 0000000..f85cde6 --- /dev/null +++ b/tools/kal-test/bin/graph/tc_run.sh @@ -0,0 +1,144 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: graph500_22, graph500_23, graph500_24, graph500_25, graph500_26" + echo "2nd argument: name of api: run, preCanonical" + echo "3rd argument: optimization algorithm or raw: no, yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +api_name=$2 +is_raw=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +source conf/graph/tc/tc_spark.properties +num_executors_val="numExecutors_${cpu_name}" +executor_cores_val="executorCores" +executor_memory_val="executorMemory_${cpu_name}" +extra_java_options_val="extraJavaOptions_${cpu_name}" +master_val="master" +deploy_mode_val="deployMode" +driver_cores_val="driverCores" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +extra_java_options=${!extra_java_options_val} +master=${!master_val} +deploy_mode=${!deploy_mode_val} +driver_cores=${!driver_cores_val} +driver_memory=${!driver_memory_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! ${deploy_mode} ] \ + || [ ! ${driver_cores} ] \ + || [ ! ${driver_memory} ] \ + || [ ! ${master} ]; then + echo "Some values are NUll,please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${extra_java_options_val}:${extra_java_options}" +echo "${driver_memory_val}:${driver_memory}" +echo "${driver_cores_val}:${driver_cores}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="${output_path_prefix}/tc/${is_raw}/${dataset_name}_${api_name}" +echo "${dataset_name} : ${input_path}" +echo "outputPath : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- tc-${dataset_name}-${api_name}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.TriangleCountRunner \ + --deploy-mode ${deploy_mode} \ + --driver-cores ${driver_cores} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory 
${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options} -XX:SurvivorRatio=4 -XX:ParallelGCThreads=6" \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf spark.network.timeout=6000s \ + --conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=25g \ + --conf spark.rpc.message.maxSize=2046 \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${api_name} ${is_raw} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.TriangleCountRunner \ + --deploy-mode ${deploy_mode} \ + --driver-cores ${driver_cores} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options} -XX:SurvivorRatio=4 -XX:ParallelGCThreads=6" \ + --conf spark.rpc.askTimeout=36000 \ + --conf spark.scheduler.maxRegisteredResourcesWaitingTime=3600000 \ + --conf spark.worker.timeout=3600 \ + --conf spark.network.timeout=6000s \ + --conf spark.storage.blockManagerSlaveTimeoutMs=600000 \ + --conf spark.shuffle.blockTransferService=nio \ + --conf spark.driver.maxResultSize=100g \ + --conf spark.shuffle.manager=SORT \ + --conf spark.broadcast.blockSize=25g \ + --conf spark.rpc.message.maxSize=2046 \ + --conf spark.core.connection.ack.wait.timeout=60000s \ + --conf spark.storage.memoryFraction=0.2 \ + --conf spark.shuffle.memoryFraction=0.6 \ + --conf spark.rdd.compress=true \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} ${api_name} ${is_raw} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/tpr_run.sh b/tools/kal-test/bin/graph/tpr_run.sh new file mode 100644 index 0000000..e76c415 --- /dev/null +++ b/tools/kal-test/bin/graph/tpr_run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: twitter_tpr" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +source conf/graph/tpr/tpr_spark.properties + +dataset_name=$1 +is_raw=$2 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path=${dataset_name} +data_path_val=${!data_path} +echo "${dataset_name} : ${data_path_val}" + +output_path="${output_path_prefix}/tpr/${is_raw}/${dataset_name}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- TrillionPageRank" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.TrillionPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 80g \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer.max=2040m \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar 
${dataset_name} ${data_path_val} ${output_path} ${is_raw} | tee ./log/log +else + scp lib/kal-test_${scala_version_val}-0.1.jar root@agent1:/opt/graph_classpath/ + scp lib/kal-test_${scala_version_val}-0.1.jar root@agent2:/opt/graph_classpath/ + scp lib/kal-test_${scala_version_val}-0.1.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.TrillionPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 80g \ + --conf spark.driver.maxResultSize=80g \ + --conf spark.locality.wait.node=0 \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer.max=2040m \ + --conf spark.rdd.compress=true \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/kal-test_${scala_version_val}-0.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${output_path} ${is_raw} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph/tr_run.sh b/tools/kal-test/bin/graph/tr_run.sh new file mode 100644 index 0000000..d88a27d --- /dev/null +++ b/tools/kal-test/bin/graph/tr_run.sh @@ -0,0 +1,117 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: cit_patents,uk_2002,arabic_2005" + echo "2nd argument: name of api: run,runUntilConvergence" + echo "3nd argument: seeds count: 100,500,1000" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + alg_usage + exit 0 +fi + +source conf/graph/tr/tr_spark.properties + +dataset_name=$1 +api_name=$2 +seedsCount=$3 + +if [ ${dataset_name} != "cit_patents" ] && + [ ${dataset_name} != "uk_2002" ] && + [ ${dataset_name} != "arabic_2005" ] ;then + echo "invalid dataset name,dataset name:cit_patents,uk_2002,arabic_2005" + exit 1 +fi +if [ ${api_name} != "run" ] && + [ ${api_name} != "runUntilConvergence" ] ;then + echo "invalid argument value,api name: run,runUntilConvergence" + exit 1 +fi +if [ ${seedsCount} != "100" ] && [ ${seedsCount} != "500" ] && [ ${seedsCount} != "1000" ];then + echo "invalid argument value,must be: 100 or 500 or 1000" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${api_name}_${dataset_name}_${seedsCount}_numExecutors_${cpu_name}" +executor_cores="${api_name}_${dataset_name}_${seedsCount}_executorCores_${cpu_name}" +executor_memory="${api_name}_${dataset_name}_${seedsCount}_executorMemory_${cpu_name}" +num_partitions="${api_name}_${dataset_name}_${seedsCount}_numPartitions_${cpu_name}" +extra_Java_Options="${api_name}_${dataset_name}_${seedsCount}_extraJavaOptions_${cpu_name}" +deploy_mode="deployMode" +driver_memory="driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +deploy_mode_val=${!deploy_mode} +num_partitions_val=${!num_partitions} +extra_Java_Options_val=${!extra_Java_Options} +driver_memory_val=${!driver_memory} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo 
"${driver_memory}:${driver_memory_val}" +echo "${extra_Java_Options}:${extra_Java_Options_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${num_partitions_val} ] ; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/tr/${dataset_name}_${seedsCount}_${api_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- tr-${dataset_name}-${api_name}-${seedsCount}" +spark-submit \ +--class com.bigdata.graph.TrustRankRunner \ +--master yarn \ +--deploy-mode ${deploy_mode_val} \ +--num-executors ${num_executors_val} \ +--executor-memory ${executor_memory_val} \ +--executor-cores ${executor_cores_val} \ +--driver-memory ${driver_memory_val} \ +--conf spark.executor.extraJavaOptions=${extra_Java_Options_val} \ +--conf spark.driver.maxResultSize=200g \ +--conf spark.driver.extraJavaOptions="-Xms100G" \ +--conf spark.locality.wait.node=0 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${num_partitions_val} "no" ${data_path_val} ${api_name} ${seedsCount} ${output_path} | tee ./log/log diff --git a/tools/kal-test/bin/graph/wce_run.sh b/tools/kal-test/bin/graph/wce_run.sh new file mode 100644 index 0000000..4eb0cfc --- /dev/null +++ b/tools/kal-test/bin/graph/wce_run.sh @@ -0,0 +1,88 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "please input 1 argument: " + echo "1st argument: name of dataset: graph500_24, graph500_25, graph500_26" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -ne 1 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 + +source conf/graph/wce/wce_spark.properties + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="${dataset_name}_numExectuors_${cpu_name}" +executor_cores="${dataset_name}_executorCores_${cpu_name}" +executor_memory="${dataset_name}_executorMemory_${cpu_name}" +extra_java_options="${dataset_name}_extraJavaOptions_${cpu_name}" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path="${output_path_prefix}/wce/${dataset_name}" +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path}" +hdfs dfs -rm -r -f ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ +scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + +echo "start to submit spark jobs -- wce-${dataset_name}" +spark-submit \ +--class com.bigdata.graph.WCERunner \ +--driver-memory 80g \ +--master yarn \ +--num-executors ${num_executors_val} \ +--executor-cores ${executor_cores_val} \ +--executor-memory ${executor_memory_val} \ +--conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/fastutil-8.3.1.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${output_path} | tee ./log/log diff --git a/tools/kal-test/bin/graph/wce_run_hive.sh b/tools/kal-test/bin/graph/wce_run_hive.sh new file mode 100644 index 0000000..474217b --- /dev/null +++ b/tools/kal-test/bin/graph/wce_run_hive.sh @@ -0,0 +1,48 @@ +#!/bin/bash +set -e + +case "$1" in 
+-h | --help | ?)
+ echo "Usage: <table_name> <col1> <col2> <maxIter> <maxDegree> <save_mode> <save_arg>
" + exit 0 + ;; +esac + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +if [ ${cpu_name} == "aarch64" ] +then + cpu_name="aarch_64" +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +table_name=$1 +col1=$2 +col2=$3 +maxIter=$4 +maxDegree=$5 +save_mode=$6 +save_arg=$7 + +spark-submit \ +--class com.bigdata.graph.WCEHiveRunner \ +--driver-memory 80g \ +--master yarn \ +--num-executors 35 \ +--executor-cores 8 \ +--executor-memory "25g" \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.rdd.compress=true \ +--conf spark.shuffle.compress=true \ +--conf spark.shuffle.spill.compress=true \ +--conf spark.io.compression.codec=lz4 \ +--jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +--conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ +./lib/kal-test_${scala_version_val}-0.1.jar ${table_name} ${col1} ${col2} ${maxIter} ${maxDegree} ${save_mode} ${save_arg} diff --git a/tools/kal-test/bin/graph/wlpa_run.sh b/tools/kal-test/bin/graph/wlpa_run.sh new file mode 100644 index 0000000..fdf399a --- /dev/null +++ b/tools/kal-test/bin/graph/wlpa_run.sh @@ -0,0 +1,117 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of dataset: enwiki_2018,arabic_2005,GAP_twitter" + echo "2nd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ];then + alg_usage + exit 0 +fi + +dataset_name=$1 +is_raw=$2 + +if [ $dataset_name != 'enwiki_2018' ] && [ $dataset_name != 'arabic_2005' ] && [ $dataset_name != 'GAP_twitter' ]; +then + echo 'invalid dataset' + echo "dataset name: enwiki_2018 or arabic_2005 or GAP_twitter" + exit 0 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${cpu_name}-${is_raw} + +source conf/graph/wlpa/wlpa_spark.properties +num_executors_val="numExecutors_${dataset_name}_${cpu_name}" +executor_cores_val="executorCores_${dataset_name}_${cpu_name}" +executor_memory_val="executorMemory_${dataset_name}_${cpu_name}" +executor_extra_javaopts_val="executorExtraJavaopts_${dataset_name}_${cpu_name}" + +master_val="master" +deploy_mode_val="deployMode" +driver_memory_val="driverMemory" +num_executors=${!num_executors_val} +executor_cores=${!executor_cores_val} +executor_memory=${!executor_memory_val} +master=${!master_val} +driver_memory=${!driver_memory_val} +deploy_mode=${!deploy_mode_val} +executor_extra_javaopts=${!executor_extra_javaopts_val} +if [ ! ${num_executors} ] \ + || [ ! ${executor_cores} ] \ + || [ ! ${executor_memory} ] \ + || [ ! 
${master} ]; then + echo "Some values are NUll, please confirm with the property files" + exit 0 +fi +echo "${master_val}:${master}" +echo "${deploy_mode_val}:${deploy_mode}" +echo "${num_executors_val}:${num_executors}" +echo "${executor_cores_val}:${executor_cores}" +echo "${executor_memory_val}:${executor_memory}" +echo "${executor_extra_javaopts_val}:${executor_extra_javaopts}" + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +input_path=${!dataset_name} +output_path="/tmp/graph/result/wlpa/${is_raw}/${dataset_name}" +echo "${dataset_name}: ${input_path},${output_path}" + +echo "start to clean exist output" +hdfs dfs -rm -r -f -skipTrash ${output_path} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs -- wlpa-${dataset_name}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.WeightedLablePropagationRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/fastutil-8.3.1.jar:/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${input_path} ${output_path} | tee ./log/log +else + spark-submit \ + --class com.bigdata.graph.WeightedLablePropagationRunner \ + --deploy-mode ${deploy_mode} \ + --driver-memory ${driver_memory} \ + --num-executors ${num_executors} \ + --executor-cores ${executor_cores} \ + --executor-memory ${executor_memory} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_javaopts}" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${input_path} ${output_path} | tee ./log/log +fi diff --git a/tools/kal-test/bin/graph/wpr_run.sh b/tools/kal-test/bin/graph/wpr_run.sh new file mode 100644 index 0000000..c24781c --- /dev/null +++ b/tools/kal-test/bin/graph/wpr_run.sh @@ -0,0 +1,148 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: cage14, GAP_road, GAP_twitter" + 
echo "2nd argument: name of api: static, convergence" + echo "3rd argument: optimization algorithm or raw: no, yes" +} +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ];then + usage + exit 0 +fi + +dataset_name=$1 +api_name=$2 +is_raw=$3 + +if [ ${dataset_name} != "GAP_road" ] && [ ${dataset_name} != "cage14" ] && [ ${dataset_name} != "GAP_twitter" ];then + echo "invalid dataset name, dataset name:GAP_road, cage14, GAP_twitter" + exit 1 +fi +if [ ${api_name} != "static" ] && [ ${api_name} != "convergence" ];then + echo "invalid api name,api name: static or convergence" + exit 1 +fi + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +prefix="run" +if [ ${api_name} == "runUntilConvergence" ] +then + prefix="convergence" +fi + +source conf/graph/wpr/wpr_spark.properties +# concatnate strings as a new variable +deploy_mode="deployMode" +num_executors="numExecutors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +split="split_graph" + +if [ ${is_raw} != "no" ]; then + num_executors=${api_name}_${dataset_name}_numExecutors + executor_cores=${api_name}_${dataset_name}_executorCores + executor_memory=${api_name}_${dataset_name}_executorMemory + extra_java_options=${api_name}_${dataset_name}_extraJavaOptions + partition=${api_name}_${dataset_name}_partition + iter=${api_name}_iter + tolerance=${api_name}_tolerance + + iter_val=${!iter} + tolerance_val=${!tolerance} + partition_val=${!partition} +fi + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +deploy_mode_val=${!deploy_mode} +split_val=${!split} + +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "split : ${split_val}" + +if [ ! ${num_executors_val} ] || + [ ! ${executor_cores_val} ] || + [ ! ${executor_memory_val} ] || + [ ! ${extra_java_options_val} ] || + [ ! 
${split_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +output_path_val=${output_path_prefix}/wpr/${is_raw}/${dataset_name}_${api_name} +echo "${dataset_name} : ${data_path_val}" +echo "output_path : ${output_path_val}" +hdfs dfs -rm -r -f ${output_path_val} + +echo "start to clean cache and sleep 3s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 3 + +echo "start to submit spark jobs -- wpr-${dataset_name}-${api_name}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/graph_classpath/ + scp lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/graph_classpath/ + + spark-submit \ + --class com.bigdata.graph.WeightedPageRankRunner \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.driver.extraJavaOptions="-Xms100G" \ + --conf spark.locality.wait.node=0 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.shuffle.blockTransferService=nio \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/graph_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${dataset_name} ${data_path_val} ${output_path_val} ${api_name} ${is_raw} ${split_val} | tee ./log/log +else + spark-submit \ + --class com.soundcloud.spark.pagerank.SparkPageRankTest \ + --master yarn \ + --deploy-mode ${deploy_mode_val} \ + --num-executors ${num_executors_val} \ + --executor-memory ${executor_memory_val} \ + --executor-cores ${executor_cores_val} \ + --driver-memory 100g \ + --conf spark.driver.maxResultSize=200g \ + --conf spark.driver.extraJavaOptions="-Xms100G" \ + --conf spark.locality.wait.node=0 \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf spark.shuffle.blockTransferService=nio \ + ./lib/spark-pagerank-1.0-SNAPSHOT.jar ${data_path_val} ${split_val} ${partition_val} ${output_path_val} 0.15 ${iter_val} ${tolerance_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/graph_workflow.sh b/tools/kal-test/bin/graph_workflow.sh new file mode 100644 index 0000000..5b6b1f2 --- /dev/null +++ b/tools/kal-test/bin/graph_workflow.sh @@ -0,0 +1,312 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: optimization 
algorithm or raw: no/yes" + echo "2nd argument: verify result: no/yes" +} + +case "$1" in +-h | --help | ?) + alg_usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + alg_usage + exit 0 +fi + +is_raw=$1 +is_check=$2 + +type=opt +if [ $is_raw == "yes" ]; then + type=raw +fi + +function createDir() { + dir=$1 + if [ ! -d $dir ]; then + mkdir $dir + fi +} +createDir logs +createDir log +createDir report + +graph_classpath=/opt/graph_classpath/ +function ssh_mkdir() { + server=$1 + dir=$2 + ssh $server "mkdir -p $dir" +} +ssh_mkdir agent1 $graph_classpath +ssh_mkdir agent2 $graph_classpath +ssh_mkdir agent3 $graph_classpath + +# betweenness +./bin/graph/betweenness_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_cit_patents_${type}.log +./bin/graph/betweenness_run.sh enwiki_2018 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_enwiki_2018_${type}.log +./bin/graph/betweenness_run.sh uk_2002 ${is_raw} ${is_check} 2>&1 | tee -a logs/betweenness_uk_2002_${type}.log + +# bfs +./bin/graph/bfs_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/bfs_cit_patents_${type}.log +./bin/graph/bfs_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a logs/bfs_enwiki_2018_${type}.log +./bin/graph/bfs_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/bfs_arabic_2005_${type}.log +./bin/graph/bfs_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_22_${type}.log +./bin/graph/bfs_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_23_${type}.log +./bin/graph/bfs_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/bfs_graph500_25_${type}.log + +# cc +./bin/graph/cc_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/cc_graph500_25_${type}.log +./bin/graph/cc_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/cc_graph500_26_${type}.log +./bin/graph/cc_run.sh liveJournal ${is_raw} 2>&1 | tee -a logs/cc_liveJournal_${type}.log + +# cd +./bin/graph/cd_run.sh simulate1 2>&1 | tee -a logs/cd_simulate1.log +./bin/graph/cd_run.sh simulate2 2>&1 | tee -a logs/cd_simulate2.log +./bin/graph/cd_run.sh usaRoad 2>&1 | tee -a logs/cd_usaRoad.log + +# closeness +./bin/graph/closeness_run.sh cit_patents weighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_weighted_${type}.log +./bin/graph/closeness_run.sh uk_2002 weighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_weighted_${type}.log + +./bin/graph/closeness_run.sh cit_patents unweighted ${is_check} 2>&1 | tee -a logs/closeness_cit_patents_unweighted_${type}.log +./bin/graph/closeness_run.sh uk_2002 unweighted ${is_check} 2>&1 | tee -a logs/closeness_uk_2002_unweighted_${type}.log + +# clusteringcoefficient +./bin/graph/clusteringcoefficient_run.sh cit_patents lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh uk_2002 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_uk_2002_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh cit_patents lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_cit_patents_lcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh uk_2002 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_uk_2002_lcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh arabic_2005 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_arabic_2005_lcc_unweighted_${type}.log + +./bin/graph/clusteringcoefficient_run.sh graph500_22 lcc weighted no 
2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_23 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_24 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_25 lcc weighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_weighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_22 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_lcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_23 lcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_lcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_24 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_lcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_25 lcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_lcc_unweighted_${type}.log + +./bin/graph/clusteringcoefficient_run.sh graph500_22 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_avgcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_23 avgcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_avgcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_24 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_avgcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_25 avgcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_avgcc_unweighted_${type}.log + +./bin/graph/clusteringcoefficient_run.sh graph500_22 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_22_globalcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_23 globalcc unweighted ${is_raw} 2>&1 | tee -a logs/clusteringcoefficient_graph500_23_globalcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_24 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_24_globalcc_unweighted_${type}.log +./bin/graph/clusteringcoefficient_run.sh graph500_25 globalcc unweighted no 2>&1 | tee -a logs/clusteringcoefficient_graph500_25_globalcc_unweighted_${type}.log + +# degree +./bin/graph/degree_run.sh mycielskian20 degrees ${is_raw} 2>&1 | tee -a logs/degree_mycielskian20_degrees_${type}.log +./bin/graph/degree_run.sh gap_kron degrees ${is_raw} 2>&1 | tee -a logs/degree_gap_kron_degrees_${type}.log +./bin/graph/degree_run.sh com_friendster degrees ${is_raw} 2>&1 | tee -a logs/degree_com_friendster_degrees_${type}.log + +./bin/graph/degree_run.sh it_2004 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_inDegrees_${type}.log +./bin/graph/degree_run.sh twitter7 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_twitter7_inDegrees_${type}.log +./bin/graph/degree_run.sh uk_2007_05 inDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_inDegrees_${type}.log + +./bin/graph/degree_run.sh it_2004 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_it_2004_outDegrees_${type}.log +./bin/graph/degree_run.sh twitter7 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_twitter7_outDegrees_${type}.log +./bin/graph/degree_run.sh uk_2007_05 outDegrees ${is_raw} 2>&1 | tee -a logs/degree_uk_2007_05_outDegrees_${type}.log + +# incpr +./bin/graph/incpr_run.sh twitter_2010 
0.001 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_1_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.001 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_2_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.001 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_3_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.001 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_4_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.001 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.001_5_${type}.log + +./bin/graph/incpr_run.sh twitter_2010 0.01 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_1_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.01 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_2_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.01 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_3_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.01 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_4_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.01 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.01_5_${type}.log + +./bin/graph/incpr_run.sh twitter_2010 0.05 1 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_1_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.05 2 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_2_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.05 3 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_3_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.05 4 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_4_${type}.log +./bin/graph/incpr_run.sh twitter_2010 0.05 5 ${is_raw} 2>&1 | tee -a logs/incpr_twitter_2010_0.05_5_${type}.log + +# kcore +./bin/graph/kcore_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_22_${type}.log +./bin/graph/kcore_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_23_${type}.log +./bin/graph/kcore_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_25_${type}.log +./bin/graph/kcore_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/kcore_graph500_26_${type}.log + +# louvain +./bin/graph/louvain_run.sh graph500_22 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_22_${type}.log +./bin/graph/louvain_run.sh graph500_24 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_24_${type}.log +./bin/graph/louvain_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/louvain_graph500_25_${type}.log + +./bin/graph/louvain_run.sh cit_patents no 2>&1 | tee -a logs/louvain_cit_patents_${type}.log +./bin/graph/louvain_run.sh uk_2002 no 2>&1 | tee -a logs/louvain_uk_2002_${type}.log +./bin/graph/louvain_run.sh arabic_2005 no 2>&1 | tee -a logs/louvain_arabic_2005_${type}.log + +# lpa +./bin/graph/lpa_run.sh graph500_22 runConvergence no 2>&1 | tee -a logs/lpa_graph500_22_runConvergence_${type}.log +./bin/graph/lpa_run.sh graph500_24 runConvergence no 2>&1 | tee -a logs/lpa_graph500_24_runConvergence_${type}.log +./bin/graph/lpa_run.sh graph500_25 runConvergence no 2>&1 | tee -a logs/lpa_graph500_25_runConvergence_${type}.log + +./bin/graph/lpa_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_22_run_${type}.log +./bin/graph/lpa_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_24_run_${type}.log +./bin/graph/lpa_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/lpa_graph500_25_run_${type}.log + +# mce +./bin/graph/mce_run.sh graph500_23 2>&1 | tee -a logs/mce_graph500_23_${type}.log +./bin/graph/mce_run.sh graph500_24 2>&1 | tee -a logs/mce_graph500_24_${type}.log +./bin/graph/mce_run.sh graph500_25 2>&1 | tee -a 
logs/mce_graph500_25_${type}.log + +# modularity +./bin/graph/modularity_run.sh graph500_23 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_23_${type}.log +./bin/graph/modularity_run.sh graph500_25 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_25_${type}.log +./bin/graph/modularity_run.sh graph500_26 ${is_raw} 2>&1 | tee -a logs/modularity_graph500_26_${type}.log + +./bin/graph/modularity_run.sh uk_2002 no 2>&1 | tee -a logs/modularity_uk_${type}.log +./bin/graph/modularity_run.sh arabic_2005 no 2>&1 | tee -a logs/modularity_arabic_${type}.log +./bin/graph/modularity_run.sh twitter no 2>&1 | tee -a logs/modularity_twitter_${type}.log + +# mssp +./bin/graph/mssp_run.sh soc_liveJournal 5 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_5_${type}.log +./bin/graph/mssp_run.sh uk_2002 5 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_5_${type}.log +./bin/graph/mssp_run.sh arabic_2005 5 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_5_${type}.log + +./bin/graph/mssp_run.sh soc_liveJournal 50 ${is_raw} 2>&1 | tee -a logs/mssp_liveJournal_50_${type}.log +./bin/graph/mssp_run.sh uk_2002 50 ${is_raw} 2>&1 | tee -a logs/mssp_uk_2002_50_${type}.log +./bin/graph/mssp_run.sh arabic_2005 50 ${is_raw} 2>&1 | tee -a logs/mssp_arabic_2005_50_${type}.log + +# node2vec +./bin/graph/node2vec_run.sh cit_patents ${is_raw} ${is_check} 2>&1 | tee -a logs/node2vec_cit_patents_${type}.log +./bin/graph/node2vec_run.sh soc_liveJournal no ${is_check} 2>&1 | tee -a logs/node2vec_soc_liveJournal_${type}.log +./bin/graph/node2vec_run.sh uk_2002 no ${is_check} 2>&1 | tee -a logs/node2vec_uk_2002_${type}.log + +# ppr +./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_cit_patents_fixMS_1_${type}.log +./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_cit_patents_fixMS_5_${type}.log +./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_cit_patents_fixMS_10_${type}.log +./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_cit_patents_fixMS_50_${type}.log +./bin/graph/ppr_run.sh cit_patents fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_cit_patents_fixMS_100_${type}.log + +./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_uk_2002_fixMS_1_${type}.log +./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_uk_2002_fixMS_5_${type}.log +./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_uk_2002_fixMS_10_${type}.log +./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_uk_2002_fixMS_50_${type}.log +./bin/graph/ppr_run.sh uk_2002 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_uk_2002_fixMS_100_${type}.log + +./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 1 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_1_${type}.log +./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 5 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_5_${type}.log +./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 10 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_10_${type}.log +./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 50 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_50_${type}.log +./bin/graph/ppr_run.sh arabic_2005 fixMS ${is_raw} 100 2>&1 | tee -a logs/ppr_arabic_2005_fixMS_100_${type}.log + +./bin/graph/ppr_run.sh cit_patents fixSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_fixSS_${type}.log +./bin/graph/ppr_run.sh uk_2002 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_fixSS_${type}.log +./bin/graph/ppr_run.sh arabic_2005 fixSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_fixSS_${type}.log + 
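+# The runs are written out one per line above and below. If a more compact form were preferred,
+# a sweep such as the ppr fixMS block above could equivalently be driven by a loop
+# (behaviour-identical sketch, using the same dataset and batch-size lists as the explicit calls):
+#   for ds in cit_patents uk_2002 arabic_2005; do
+#     for bs in 1 5 10 50 100; do
+#       ./bin/graph/ppr_run.sh ${ds} fixMS ${is_raw} ${bs} 2>&1 | tee -a logs/ppr_${ds}_fixMS_${bs}_${type}.log
+#     done
+#   done
+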
+./bin/graph/ppr_run.sh cit_patents conSS ${is_raw} 2>&1 | tee -a logs/ppr_cit_patents_conSS_${type}.log +./bin/graph/ppr_run.sh uk_2002 conSS ${is_raw} 2>&1 | tee -a logs/ppr_uk_2002_conSS_${type}.log +./bin/graph/ppr_run.sh arabic_2005 conSS ${is_raw} 2>&1 | tee -a logs/ppr_arabic_2005_conSS_${type}.log + +# pr +./bin/graph/pr_run.sh cit_patents run ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_run_${type}.log +./bin/graph/pr_run.sh uk_2002 run ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_run_${type}.log +./bin/graph/pr_run.sh arabic_2005 run ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_run_${type}.log + +./bin/graph/pr_run.sh cit_patents runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_cit_patents_runUntilConvergence_${type}.log +./bin/graph/pr_run.sh uk_2002 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_uk_2002_runUntilConvergence_${type}.log +./bin/graph/pr_run.sh arabic_2005 runUntilConvergence ${is_raw} 2>&1 | tee -a logs/pr_arabic_2005_runUntilConvergence_${type}.log + +# scc +./bin/graph/scc_run.sh cit_patents ${is_raw} 2>&1 | tee -a logs/scc_cit_patents_${type}.log +./bin/graph/scc_run.sh enwiki_2018 ${is_raw} 2>&1 | tee -a logs/scc_enwiki_2018_${type}.log +./bin/graph/scc_run.sh arabic_2005 ${is_raw} 2>&1 | tee -a logs/scc_arabic_2005_${type}.log + +# sgm +./bin/graph/sgm_run.sh graph500_19 4dgn Identical no 2>&1 | tee -a logs/sgm_graph500_19_4dgn_Identical_${type}.log +./bin/graph/sgm_run.sh graph500_19 4sqr Identical no 2>&1 | tee -a logs/sgm_graph500_19_4sqr_Identical_${type}.log +./bin/graph/sgm_run.sh graph500_19 5tree Identical no 2>&1 | tee -a logs/sgm_graph500_19_5tree_Identical_${type}.log +./bin/graph/sgm_run.sh graph500_19 6star Identical no 2>&1 | tee -a logs/sgm_graph500_19_6star_Identical_${type}.log + +./bin/graph/sgm_run.sh liveJournal 4dgn Identical no 2>&1 | tee -a logs/sgm_liveJournal_4dgn_Identical_${type}.log +./bin/graph/sgm_run.sh liveJournal 4sqr Identical no 2>&1 | tee -a logs/sgm_liveJournal_4sqr_Identical_${type}.log +./bin/graph/sgm_run.sh liveJournal 5tree Identical no 2>&1 | tee -a logs/sgm_liveJournal_5tree_Identical_${type}.log +./bin/graph/sgm_run.sh liveJournal 6star Identical no 2>&1 | tee -a logs/sgm_liveJournal_6star_Identical_${type}.log + +./bin/graph/sgm_run.sh com_orkut 4dgn Identical no 2>&1 | tee -a logs/sgm_com_orkut_4dgn_Identical_${type}.log +./bin/graph/sgm_run.sh com_orkut 4sqr Identical no 2>&1 | tee -a logs/sgm_com_orkut_4sqr_Identical_${type}.log +./bin/graph/sgm_run.sh com_orkut 5tree Identical no 2>&1 | tee -a logs/sgm_com_orkut_5tree_Identical_${type}.log +./bin/graph/sgm_run.sh com_orkut 6star Identical no 2>&1 | tee -a logs/sgm_com_orkut_6star_Identical_${type}.log + +./bin/graph/sgm_run.sh graph500_19 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4dgn_${type}.log +./bin/graph/sgm_run.sh graph500_19 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_4clique_${type}.log +./bin/graph/sgm_run.sh graph500_19 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_5clique_${type}.log +./bin/graph/sgm_run.sh graph500_19 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_graph500_19_unIdentical_6clique_${type}.log + +./bin/graph/sgm_run.sh liveJournal 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4dgn_${type}.log +./bin/graph/sgm_run.sh liveJournal 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_4clique_${type}.log +./bin/graph/sgm_run.sh liveJournal 5clique unIdentical ${is_raw} 2>&1 | tee -a 
logs/sgm_liveJournal_unIdentical_5clique_${type}.log +./bin/graph/sgm_run.sh liveJournal 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_liveJournal_unIdentical_6clique_${type}.log + +./bin/graph/sgm_run.sh com_orkut 4dgn unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4dgn_${type}.log +./bin/graph/sgm_run.sh com_orkut 4clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_4clique_${type}.log +./bin/graph/sgm_run.sh com_orkut 5clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_5clique_${type}.log +./bin/graph/sgm_run.sh com_orkut 6clique unIdentical ${is_raw} 2>&1 | tee -a logs/sgm_com_orkut_unIdentical_6clique_${type}.log + +# tc +./bin/graph/tc_run.sh graph500_22 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_run_${type}.log +./bin/graph/tc_run.sh graph500_23 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_23_run_${type}.log +./bin/graph/tc_run.sh graph500_24 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_run_${type}.log +./bin/graph/tc_run.sh graph500_25 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_run_${type}.log +./bin/graph/tc_run.sh graph500_26 run ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_run_${type}.log + +./bin/graph/tc_run.sh graph500_22 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_22_preCanonical_${type}.log +./bin/graph/tc_run.sh graph500_23 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_23_preCanonical_${type}.log +./bin/graph/tc_run.sh graph500_24 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_24_preCanonical_${type}.log +./bin/graph/tc_run.sh graph500_25 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_25_preCanonical_${type}.log +./bin/graph/tc_run.sh graph500_26 preCanonical ${is_raw} 2>&1 | tee -a logs/tc_graph500_26_preCanonical_${type}.log + +# tpr +./bin/graph/tpr_run.sh twitter_tpr ${is_raw} 2>&1 | tee -a logs/tpr_twitter_${type}.log + +# tr +./bin/graph/tr_run.sh cit_patents run 100 2>&1 | tee -a logs/tr_cit_patents_run_100_${type}.log +./bin/graph/tr_run.sh cit_patents run 500 2>&1 | tee -a logs/tr_cit_patents_run_500_${type}.log +./bin/graph/tr_run.sh cit_patents run 1000 2>&1 | tee -a logs/tr_cit_patents_run_1000_${type}.log +./bin/graph/tr_run.sh uk_2002 run 100 2>&1 | tee -a logs/tr_uk_2002_run_100_${type}.log +./bin/graph/tr_run.sh uk_2002 run 500 2>&1 | tee -a logs/tr_uk_2002_run_500_${type}.log +./bin/graph/tr_run.sh uk_2002 run 1000 2>&1 | tee -a logs/tr_uk_2002_run_1000_${type}.log +./bin/graph/tr_run.sh arabic_2005 run 100 2>&1 | tee -a logs/tr_arabic_2005_run_100_${type}.log +./bin/graph/tr_run.sh arabic_2005 run 500 2>&1 | tee -a logs/tr_arabic_2005_run_500_${type}.log +./bin/graph/tr_run.sh arabic_2005 run 1000 2>&1 | tee -a logs/tr_arabic_2005_run_1000_${type}.log + +./bin/graph/tr_run.sh cit_patents runUntilConvergence 100 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_100_${type}.log +./bin/graph/tr_run.sh cit_patents runUntilConvergence 500 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_500_${type}.log +./bin/graph/tr_run.sh cit_patents runUntilConvergence 1000 2>&1 | tee -a logs/tr_cit_patents_runUntilConvergence_1000_${type}.log +./bin/graph/tr_run.sh uk_2002 runUntilConvergence 100 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_100_${type}.log +./bin/graph/tr_run.sh uk_2002 runUntilConvergence 500 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_500_${type}.log +./bin/graph/tr_run.sh uk_2002 runUntilConvergence 1000 2>&1 | tee -a logs/tr_uk_2002_runUntilConvergence_1000_${type}.log +./bin/graph/tr_run.sh arabic_2005 
runUntilConvergence 100 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_100_${type}.log +./bin/graph/tr_run.sh arabic_2005 runUntilConvergence 500 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_500_${type}.log +./bin/graph/tr_run.sh arabic_2005 runUntilConvergence 1000 2>&1 | tee -a logs/tr_arabic_2005_runUntilConvergence_1000_${type}.log + +# wce +./bin/graph/wce_run.sh graph500_24 2>&1 | tee -a logs/wce_graph500_24_${type}.log +./bin/graph/wce_run.sh graph500_25 2>&1 | tee -a logs/wce_graph500_25_${type}.log +./bin/graph/wce_run.sh graph500_26 2>&1 | tee -a logs/wce_graph500_26_${type}.log + +# wpr +./bin/graph/wpr_run.sh cage14 static ${is_raw} 2>&1 | tee -a logs/wpr_cage14_static_${type}.log +./bin/graph/wpr_run.sh GAP_road static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_road_static_${type}.log +./bin/graph/wpr_run.sh GAP_twitter static ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_static_${type}.log + +./bin/graph/wpr_run.sh cage14 convergence ${is_raw} 2>&1 | tee -a logs/wpr_cage14_convergence_${type}.log +./bin/graph/wpr_run.sh GAP_road convergence ${is_raw} 2>&1 | tee -a logs/wpr_GAP_road_convergence_${type}.log +./bin/graph/wpr_run.sh GAP_twitter convergence ${is_raw} 2>&1 | tee -a logs/wpr_GAP_twitter_convergence_${type}.log diff --git a/tools/kal-test/bin/ml/als_run.sh b/tools/kal-test/bin/ml/als_run.sh new file mode 100644 index 0000000..c4bac6c --- /dev/null +++ b/tools/kal-test/bin/ml/als_run.sh @@ -0,0 +1,137 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. als/alsbs/alsh" + echo "3rd argument: name of API: e.g. fit/fit1/fit2/fit3; for rdd: train" + echo "4th argument: optimization algorithm or raw: [no/yes]" + echo "5th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 5 ]; then + usage + exit 0 +fi + +source conf/ml/als/als_spark.properties +data_structure=$1 +dataset_name=$2 +api_name=$3 +is_raw=$4 +if_check=$5 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +mkdir -p log + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- als-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.ALSRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --driver-java-options "-Xms20g -Xss5g" \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=-Xms20g -Xss5g" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --jars "lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + --class com.bigdata.ml.ALSRunner \ + --driver-java-options "-Xms15g -Xss5g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=-Xms20g -Xss5g" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --jars "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/bo_run.sh b/tools/kal-test/bin/ml/bo_run.sh new file mode 100644 index 0000000..cc27621 --- /dev/null +++ b/tools/kal-test/bin/ml/bo_run.sh @@ -0,0 +1,129 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset: [BostonHousing/TitanicRf/TitanicGBT]" + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "3th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/bo/bo_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}"_numExecutors" +executor_cores=${cpu_name}"_executorCores" +executor_memory=${cpu_name}"_executorMemory" +driver_cores=${cpu_name}"_driverCores" +driver_memory=${cpu_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- BayesianOptimization-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/snakeyaml-1.19.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/snakeyaml-1.19.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/snakeyaml-1.19.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.BORunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf spark.locality.wait=0s \ + --conf spark.scheduler.minRegisteredResourcesRatio=1 \ + --conf spark.driver.maxResultSize=10g \ + --conf spark.network.timeout=60000s \ + --conf spark.rpc.askTimeout=60000s \ + --conf spark.executor.heartbeatInterval=600s \ + --conf spark.eventLog.enabled=false \ + --jars "lib/fastutil-8.3.1.jar,lib/snakeyaml-1.19.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.tencent.angel.spark.automl.AngelBayesianOptimization \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf spark.locality.wait=0s \ + --conf spark.scheduler.minRegisteredResourcesRatio=1 \ + --conf spark.driver.maxResultSize=10g \ + --conf spark.network.timeout=60000s \ + --conf spark.rpc.askTimeout=60000s \ + --conf spark.executor.heartbeatInterval=600s \ + --conf spark.eventLog.enabled=false \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/cov_run.sh b/tools/kal-test/bin/ml/cov_run.sh new file mode 100644 index 0000000..109c27f --- /dev/null +++ b/tools/kal-test/bin/ml/cov_run.sh @@ -0,0 +1,145 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. CP10M1K/CP2M5K/CP1M10K" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/cov/cov_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +executor_memory_overhead="executorMemOverhead_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_max_result_size="driverMaxResultSize" +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_memory_overhead_val=${!executor_memory_overhead} +extra_java_options_val=${!extra_java_options} +driver_max_result_size_val=${!driver_max_result_size} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${driver_max_result_size} : ${driver_max_result_size_val}" + + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_memory_overhead_val} ] \ + || [ ! ${driver_max_result_size_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + + +echo "start to submit spark jobs --- Cov-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.CovRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=${driver_max_result_size_val}" \ + --conf "spark.network.timeout=3600s" \ + --conf "spark.executor.heartbeatInterval=1000s" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + 
./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + scp lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.CovRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=${driver_max_result_size_val}" \ + --jars "lib/fastutil-8.3.1.jar" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi + diff --git a/tools/kal-test/bin/ml/dbscan_run.sh b/tools/kal-test/bin/ml/dbscan_run.sh new file mode 100644 index 0000000..650e092 --- /dev/null +++ b/tools/kal-test/bin/ml/dbscan_run.sh @@ -0,0 +1,169 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. bremenSmall/farm/house" + echo "2nd argument: optimization algorithm or raw: [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + +source conf/ml/dbscan/dbscan_spark.properties +dataset_name=$1 +is_raw=$2 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw} +type=opt +if [ $is_raw == "yes" ]; then + type=raw +fi + +# concatnate strings as a new variable +num_executors="numExectuors_"${type} +executor_cores="executorCores_"${type} +executor_memory="executorMemory_"${type} +extra_java_options="extraJavaOptions_"${type} +driver_cores="driverCores_"${type} +driver_memory="driverMemory_"${type} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
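Every run script repeats the same four ssh calls to drop the OS page cache before timing a job. If the cluster layout ever changes, a small helper keeps the host list in one place; a sketch that assumes the same server1/agent1-3 host names and root access used above:

```bash
#!/bin/bash
# Drop the Linux page cache on every test node and let I/O settle before the
# timed spark-submit. Host names mirror the ones used throughout these scripts.
nodes=(server1 agent1 agent2 agent3)

drop_caches() {
  local node
  for node in "${nodes[@]}"; do
    # Writing to /proc/sys/vm/drop_caches needs root on the remote host,
    # just as in the inline ssh calls above.
    ssh "${node}" "echo 3 > /proc/sys/vm/drop_caches"
  done
  sleep 30
}

drop_caches
```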
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + + +if [ ${is_raw} == "yes" ]; then + driver_max_result_size="driverMaxResultSize_"${type} + epsilon="epsilon_"${dataset_name}_${type} + min_points="minPoints_"${dataset_name}_${type} + + driver_max_result_size_val=${!driver_max_result_size} + epsilon_val=${!epsilon} + min_points_val=${!min_points} + + echo "${driver_max_result_size} : ${driver_max_result_size_val}" + echo "${epsilon} : ${epsilon_val}" + echo "${min_points} : ${min_points_val}" + + + if [ ! ${driver_max_result_size_val} ] \ + || [ ! ${epsilon_val} ] \ + || [ ! ${min_points_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 + fi +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +outputPath="${save_resultPath_val}/dbscan/alitoukaDBSCAN/output_${dataset_name}" +hdfsJarPath="hdfs:///tmp/ml/test/dbscan" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +mkdir -p log +echo "start to submit spark jobs --- DBSCAN-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class org.apache.spark.ml.clustering.DBSCANRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.task.maxFailures=1" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path 
"lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} | tee ./log/log +else + hdfs dfs -rm -r -f ${outputPath} + hdfs dfs -mkdir -p ${hdfsJarPath} + hdfs dfs -ls ${hdfsJarPath} + if [ $? -eq 0 ];then + hdfs dfs -rm -r -f ${hdfsJarPath}/alitouka_dbscan_2.11-0.1.jar + hdfs dfs -put ./lib/alitouka_dbscan_2.11-0.1.jar ${hdfsJarPath} + fi + + spark-submit \ + --jars "lib/scopt_2.11-3.5.0.jar" \ + --class org.alitouka.spark.dbscan.DbscanDriver \ + --deploy-mode ${deploy_mode_val} \ + --name "alitouka_DBSCAN_${model_conf}" \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.maxResultSize=${driver_max_result_size_val}" \ + ${hdfsJarPath}/alitouka_dbscan_2.11-0.1.jar --ds-master ${master_val} --ds-jar ${hdfsJarPath}/alitouka_dbscan_${scala_version_val}-0.1.jar --ds-input ${data_path_val} --ds-output ${outputPath} --eps ${epsilon_val} --numPts ${min_points_val} >dbscan_tmp.log + CostTime=$(cat dbscan_tmp.log | grep "train total" | awk '{print $3}') + currentTime=$(date "+%Y%m%d_%H%M%S") + rm -rf dbscan_tmp.log + echo -e "algorithmName: DBSCAN\ncostTime: ${CostTime}\ndatasetName: ${dataset_name}\nisRaw: 'yes'\ntestcaseType: DBSCAN_opensource_${dataset_name}\n" > ./report/"DBSCAN_${dataset_name}_raw_${currentTime}.yml" + echo "Exec Successful: costTime: ${CostTime}" > ./log/log +fi + diff --git a/tools/kal-test/bin/ml/dt_run.sh b/tools/kal-test/bin/ml/dt_run.sh new file mode 100644 index 0000000..1c5b79a --- /dev/null +++ b/tools/kal-test/bin/ml/dt_run.sh @@ -0,0 +1,145 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of algorithm: [classification/regression]" + echo "2nd argument: type of data structure: [dataframe/rdd]" + echo "3rd argument: name of dataset: [epsilon/higgs/mnist8m]" + echo "4th argument: name of API: [for dataframe: fit/fit1/fit2/fit3; for rdd: trainClassifier/trainRegressor]" + echo "5th argument: optimization algorithm or raw: [no/yes]" + echo "6th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 6 ]; then + usage + exit 0 +fi + +source conf/ml/dt/dt_spark.properties +algorithm_type=$1 +data_structure=$2 +dataset_name=$3 +api_name=$4 +is_raw=$5 +if_check=$6 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${algorithm_type}"_"${dataset_name}"_numExectuors" +executor_cores=${cpu_name}_${algorithm_type}"_"${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${algorithm_type}"_"${dataset_name}"_executorMemory" +extra_java_options=${cpu_name}_${algorithm_type}"_"${dataset_name}"_extraJavaOptions" +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" +max_failures="maxFailures" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +max_failures_val=${!max_failures} +compress_val=${!compress_} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${max_failures} : ${max_failures_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${max_failures_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- dt-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.DTRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.taskmaxFailures=${max_failures_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" 
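The scp calls and the --jars / --driver-class-path / spark.executor.extraClassPath strings above all enumerate the same BoostKit jars, which makes the lists easy to let drift apart. A sketch that builds the list once and derives each variant from it; the version variables are placeholders standing in for what the properties files provide, and /opt/ml_classpath is the same agent-side directory used above:

```bash
#!/bin/bash
# Build the BoostKit jar list once and derive the --jars, driver class path and
# executor class path strings from the same array.
# Placeholder versions; the real scripts read these from the properties files.
scala_version_val=2.12
kal_version_val=2.2.0
spark_version_val=3.1.1
cpu_name=aarch64

boostkit_jars=(
  lib/fastutil-8.3.1.jar
  lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar
  lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar
  lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar
)

jars_csv=$(IFS=,; echo "${boostkit_jars[*]}")     # comma-separated for --jars
driver_cp=$(IFS=:; echo "${boostkit_jars[*]}")    # colon-separated for --driver-class-path
executor_cp=$(printf '/opt/ml_classpath/%s:' "${boostkit_jars[@]##*/}")
executor_cp=${executor_cp%:}                      # same jars, agent-side location

# Push the jars to the agents (same targets as the scp calls above).
for node in agent1 agent2 agent3; do
  scp "${boostkit_jars[@]}" root@${node}:/opt/ml_classpath/
done

echo "--jars ${jars_csv}"
echo "--driver-class-path lib/kal-test_${scala_version_val}-0.1.jar:${driver_cp}:lib/snakeyaml-1.19.jar"
echo "--conf spark.executor.extraClassPath=${executor_cp}"
```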
\ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.DTRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.taskmaxFailures=${max_failures_val}" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/dtb_run.sh b/tools/kal-test/bin/ml/dtb_run.sh new file mode 100644 index 0000000..f18aa27 --- /dev/null +++ b/tools/kal-test/bin/ml/dtb_run.sh @@ -0,0 +1,148 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: higgs/mnist8m" + echo "2nd argument: name of API: fit/fit1/fit2/fit3" + echo "3rd argument: save or verify result: save/verify" + echo "4th argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/dtb/dtb_spark.properties +dataset_name=$1 +api_name=$2 +verify=$3 +is_raw=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" +max_failures="maxFailures" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +max_failures_val=${!max_failures} +compress_val=${!compress_} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${max_failures} : ${max_failures_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${max_failures_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${is_raw}/${spark_version_val}/${dataset_name}" +hdfs dfs -mkdir -p ${bucketedResPath} + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +model_conf=${dataset_name}-${api_name}-${verify}-${bucketedResPath} +echo "start to submit spark jobs --- dtb-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.DTBRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.taskmaxFailures=${max_failures_val}" \ + --jars "lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${is_raw} ${spark_conf} | tee ./log/log +else + scp lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-kernel-client_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.DTBRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.taskmaxFailures=${max_failures_val}" \ + --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${is_raw} ${spark_conf} | tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/encoder_run.sh b/tools/kal-test/bin/ml/encoder_run.sh new file mode 100644 index 0000000..3a99061 --- /dev/null +++ b/tools/kal-test/bin/ml/encoder_run.sh @@ -0,0 +1,134 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset: e.g. encoder_400m,encoder_800m" + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "3th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + + +source conf/ml/encoder/encoder_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" +mkdir -p /data/data1/tmp/encoder +localSavePath=/data/data1/tmp/encoder +path_conf=${data_path_val},${localSavePath} + +echo "start to clean cache and sleep 3s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 3 + +echo "start to submit spark jobs --- encoder-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + mkdir -p log + spark-submit \ + --class 
com.bigdata.ml.EncoderRunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.executor.extraJavaOptions=${executor_extra_java_options_val} \ + --conf spark.rdd.compress=false \ + --conf spark.eventLog.enabled=true \ + --conf spark.driver.maxResultSize=40g \ + --conf spark.network.timeout=60s \ + --conf "spark.driver.extraJavaOptions=-Xss5g -Dlog4j.configuration=file:./log4j.properties" \ + --driver-java-options "-Xms15g" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --jars "lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${path_conf} ${cpu_name} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.EncoderRunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.executor.extraJavaOptions=${executor_extra_java_options_val} \ + --conf spark.rdd.compress=false \ + --conf spark.eventLog.enabled=true \ + --conf spark.driver.maxResultSize=40g \ + --conf spark.network.timeout=60s \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --conf "spark.driver.extraJavaOptions=-Xss5g -Dlog4j.configuration=file:./log4j.properties" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${path_conf} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/fm_run.sh b/tools/kal-test/bin/ml/fm_run.sh new file mode 100644 index 0000000..8ae5603 --- /dev/null +++ b/tools/kal-test/bin/ml/fm_run.sh @@ -0,0 +1,133 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of algorithm: [classification/regression]" + echo "2nd argument: name of dataset: [epsilon/higgs/avazu/kdda]" + echo "3rd argument: name of API: [fit/fit1/fit2/fit3]" + echo "4th argument: optimization algorithm or raw: [no/yes]" + echo "5th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
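Some of the scripts above create ./log before piping output through tee (and the DBSCAN raw branch writes into ./report), while others assume both directories already exist. A small pre-flight step covering the common case; the jar name follows the pattern used throughout, and the Scala version is a placeholder:

```bash
#!/bin/bash
# Pre-flight: create the output directories used by `tee ./log/log` and the
# report writers, and confirm the test jar is present before submitting.
scala_version_val=2.12   # placeholder; the scripts read it from conf/ml/ml_datasets.properties

mkdir -p log report

kal_jar="lib/kal-test_${scala_version_val}-0.1.jar"
if [ ! -f "${kal_jar}" ]; then
  echo "Missing ${kal_jar}: build kal-test and copy the jar into lib/ first"
  exit 1
fi
echo "pre-flight OK"
```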
+ usage + exit 0 + ;; +esac + +if [ $# -ne 5 ]; then + usage + exit 0 +fi + +source conf/ml/fm/fm_spark.properties +algorithm_type=$1 +dataset_name=$2 +api_name=$3 +is_raw=$4 +if_check=$5 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${algorithm_type}_${dataset_name}"_numExectuors" +executor_cores=${cpu_name}_${algorithm_type}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${algorithm_type}_${dataset_name}"_executorMemory" +extra_java_options=${cpu_name}_${algorithm_type}_${dataset_name}"_extraJavaOptions" +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- fm-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/json4s-ext_2.12-3.2.11.jar lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/json4s-ext_2.12-3.2.11.jar lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/json4s-ext_2.12-3.2.11.jar lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar 
lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.FMRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --driver-java-options "-Dlog4j.configuration=file:./log4j.properties -Dhdp.version=3.1.0.0-78" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.network.timeout=3600s" \ + --jars "lib/json4s-ext_2.12-3.2.11.jar,lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.FMRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.network.timeout=3600s" \ + --conf "spark.driver.maxResultSize=2G" \ + --driver-class-path "lib/json4s-ext_2.12-3.2.11.jar:lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/fpg_run.sh b/tools/kal-test/bin/ml/fpg_run.sh new file mode 100644 index 0000000..e5e7f47 --- /dev/null +++ b/tools/kal-test/bin/ml/fpg_run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset: [Kosarak,Kosarak25,IBM700]" + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "3th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
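All of these scripts key their executor sizing on cpu_name, taken from the Architecture line of lscpu, so each properties file is expected to carry one set of keys per architecture (typically aarch64 on Kunpeng nodes and x86_64 on Intel/AMD nodes). A quick check of what the lookup will resolve to on a given node; the example key names are illustrative:

```bash
#!/bin/bash
# Show which architecture-specific property keys the run scripts will look for.
cpu_name=$(lscpu | grep Architecture | awk '{print $2}')
echo "cpu_name resolved to: ${cpu_name}"
# Example key shapes derived from it (dataset name is illustrative):
echo "expected keys look like: ${cpu_name}_Kosarak_numExecutors, ${cpu_name}_Kosarak_executorMemory"
```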
+ usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/fpg/fpg_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- fpg-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.FPGRunner \ + --deploy-mode ${deployMode} \ + --driver-cores 
${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --driver-java-options "-Dlog4j.configuration=file:./log4j.properties" \ + --conf "spark.task.maxFailures=1" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.FPGRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --conf "spark.task.maxFailures=1" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/gbdt_run.sh b/tools/kal-test/bin/ml/gbdt_run.sh new file mode 100644 index 0000000..b089797 --- /dev/null +++ b/tools/kal-test/bin/ml/gbdt_run.sh @@ -0,0 +1,132 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of algorithm: [classification/regression]" + echo "2nd argument: type of data structure: [dataframe/rdd]" + echo "3rd argument: name of dataset: [epsilon/rcv/D10M4096libsvm]" + echo "4th argument: name of API: [for dataframe: fit/fit1/fit2/fit3; for rdd: rdd/javardd]" + echo "5th argument: optimization algorithm or raw: [no/yes]" + echo "6th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
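All of the submits above run under set -e but capture output with `spark-submit ... | tee ./log/log`; in a plain pipeline the shell only inspects the status of the last command (tee), so a failed submit still leaves the script looking successful. If failure detection matters, pipefail restores it; a minimal sketch with a stand-in job:

```bash
#!/bin/bash
set -e
# With pipefail, a pipeline reports the first failing command's status, so a
# failed spark-submit is no longer masked by a successful tee.
set -o pipefail

run_job() {
  false   # stand-in for a spark-submit invocation that fails
}

mkdir -p log
if ! run_job | tee ./log/log; then
  echo "submit failed; see ./log/log"
  exit 1
fi
```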
+ usage + exit 0 + ;; +esac + +if [ $# -ne 6 ]; then + usage + exit 0 +fi + +source conf/ml/gbdt/gbdt_spark.properties +algorithm_type=$1 +data_structure=$2 +dataset_name=$3 +api_name=$4 +is_raw=$5 +if_check=$6 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors" +executor_cores=${dataset_name}"_executorCores_"${cpu_name} +executor_memory="executorMemory" +extra_java_options="extraJavaOptions" +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- gbdt-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.GBDTRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory 
${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf spark.dynamicAllocation.enabled=false \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.GBDTRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/hdb_run.sh b/tools/kal-test/bin/ml/hdb_run.sh new file mode 100644 index 0000000..644c72b --- /dev/null +++ b/tools/kal-test/bin/ml/hdb_run.sh @@ -0,0 +1,124 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset:Hibench1m_100,Hibench1m_200 " + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "3th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
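The NULL checks above use `[ ! ${var} ]` with unquoted expansions; that works for single-word values, but an extraJavaOptions string holding several flags (for example "-Xms12g -Xmx12g") expands to multiple words and makes the test error out instead of catching the empty case. Quoted -z checks behave the same for empty values and stay correct for multi-word ones; a sketch with illustrative values:

```bash
#!/bin/bash
# Quoted -z checks stay correct even when a value holds several words,
# e.g. an extraJavaOptions string. Values below are illustrative.
num_executors_val=12
extra_java_options_val="-Xms12g -Xmx12g"

if [ -z "${num_executors_val}" ] \
    || [ -z "${extra_java_options_val}" ]; then
  echo "Some values are NULL, please confirm with the property files"
  exit 1
fi
echo "all required values are set"
```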
+ usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/hdb/hdb_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- hdb-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar 
lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.HDBRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.rpc.message.maxSize=1024" \ + --conf "spark.driver.maxResultSize=4g" \ + --conf "spark.task.maxFailures=100" \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --driver-java-options "-Dlog4j.configuration=file:./log4j.properties" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:/opt/ml_classpath/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.HDBRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/idf_run.sh b/tools/kal-test/bin/ml/idf_run.sh new file mode 100644 index 0000000..ecc8c54 --- /dev/null +++ b/tools/kal-test/bin/ml/idf_run.sh @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset: e.g. D10m200m" + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "3th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + + +source conf/ml/idf/idf_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +echo "start to clean cache and sleep 3s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 3 + +echo "start to submit spark jobs --- idf-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + mkdir -p log + spark-submit \ + --class com.bigdata.ml.IDFRunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory 
${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.executor.extraJavaOptions=${executor_extra_java_options_val} \ + --driver-java-options "-Xms15g" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.IDFRunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.executor.extraJavaOptions=${executor_extra_java_options_val} \ + --driver-java-options "-Xms15g" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --jars "lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/if_run.sh b/tools/kal-test/bin/ml/if_run.sh new file mode 100644 index 0000000..973ae80 --- /dev/null +++ b/tools/kal-test/bin/ml/if_run.sh @@ -0,0 +1,121 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: [if_40M_1k/if_1M_1k]" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/if/if_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! 
${deployMode} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- if-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.IFRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --driver-java-options "-Xms15g -Dlog4j.configuration=file:./log4j.properties" \ + --conf "spark.driver.maxResultSize=2g" \ + --conf "spark.sophon.isolationForest.parLevel=100" \ + --jars "lib/isolation-forest_3.1.1_2.12-2.0.8.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/isolation-forest_3.1.1_2.12-2.0.8.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.IFRunner \ + 
--deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --driver-java-options "-Xms15g -Dlog4j.configuration=file:./log4j.properties" \ + --conf "spark.driver.maxResultSize=2g" \ + --driver-class-path "lib/isolation-forest_3.1.1_2.12-2.0.8.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --jars "lib/isolation-forest_3.1.1_2.12-2.0.8.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/kmeans_run.sh b/tools/kal-test/bin/ml/kmeans_run.sh new file mode 100644 index 0000000..3d54c50 --- /dev/null +++ b/tools/kal-test/bin/ml/kmeans_run.sh @@ -0,0 +1,137 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. D200M20" + echo "3rd argument: name of API: e.g. fit/fit1/fit2/fit3" + echo "4th argument: optimization algorithm or raw: [no/yes]" + echo "5th argument: Whether to Compare Results [no/yes]" +} + + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 5 ]; then + usage + exit 0 +fi + +source conf/ml/kmeans/kmeans_spark.properties +data_structure=$1 +dataset_name=$2 +api_name=$3 +is_raw=$4 +if_check=$5 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${data_structure}-${dataset_name}-${api_name}-${cpu_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +driver_java_options="driverJavaOptions_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_java_options_val=${!driver_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${driver_java_options} : ${driver_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to submit spark jobs --- KMeans-${model_conf}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- kmeans-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/mahout-core-0.9.jar lib/mahout-math-0.9.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/mahout-core-0.9.jar lib/mahout-math-0.9.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/mahout-core-0.9.jar lib/mahout-math-0.9.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.KMeansRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --driver-java-options ${driver_java_options_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/mahout-core-0.9.jar,lib/mahout-math-0.9.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/mahout-math-0.9.jar:lib/mahout-core-0.9.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/mahout-core-0.9.jar:/opt/ml_classpath/mahout-math-0.9.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.KMeansRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --driver-java-options ${driver_java_options_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/mahout-core-0.9.jar,lib/mahout-math-0.9.jar" \ + --driver-class-path "lib/mahout-math-0.9.jar:lib/mahout-core-0.9.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=lib/mahout-core-0.9.jar:lib/mahout-math-0.9.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/knn_run.sh b/tools/kal-test/bin/ml/knn_run.sh new file mode 100644 index 0000000..a1ad21d --- /dev/null +++ b/tools/kal-test/bin/ml/knn_run.sh @@ -0,0 +1,145 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. glove/gist/deep1b" + echo "2th argument: optimization algorithm or raw: [no/yes]" + echo "if u want to compare result, pls execute {./bin/compare/ml/KNNVerify.sh } " +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + usage + exit 0 +fi + + +source conf/ml/knn/knn_spark.properties +dataset_name=$1 +is_raw=$2 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores_"${dataset_name}_${cpu_name} +driver_memory="driverMemory_"${dataset_name}_${cpu_name} +memory_overhead="execMemOverhead_"${dataset_name}_${cpu_name} +master_="master" +deploy_mode="deployMode" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +memory_overhead_val=${!memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +compress_val=${!compress_} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${memory_overhead} : ${memory_overhead_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${memory_overhead_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +mkdir -p log + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- KNN-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.KNNRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.KNNRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/lda_run.sh b/tools/kal-test/bin/ml/lda_run.sh new file mode 100644 index 0000000..8e15ba5 --- /dev/null +++ b/tools/kal-test/bin/ml/lda_run.sh @@ -0,0 +1,132 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. nytimes/pubmed/D20M200K" + echo "3rd argument: name of API: e.g. fit/fit1/fit2/fit3/run" + echo "4th argument: optimization algorithm or raw: [no/yes]" + echo "5th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 5 ]; then + usage + exit 0 +fi + +source conf/ml/lda/lda_spark.properties +data_structure=$1 +dataset_name=$2 +api_name=$3 +is_raw=$4 +if_check=$5 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! 
${driver_memory_val} ] \ + || [ ! ${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +mkdir -p log +echo "start to submit spark jobs --- LDA-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.LDARunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.cores=${driver_cores_val}" \ + --conf "spark.task.cpus=${executor_cores_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --jars "lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.LDARunner \ + --deploy-mode 
${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/lgbm_run.sh b/tools/kal-test/bin/ml/lgbm_run.sh new file mode 100644 index 0000000..b9e8622 --- /dev/null +++ b/tools/kal-test/bin/ml/lgbm_run.sh @@ -0,0 +1,131 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of algorithm: [classification/regression]" + echo "2nd argument: name of dataset:mnist8m, higgs,criteo " + echo "3rd argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/lgbm/lgbm_spark.properties +algorithm_type=$1 +dataset_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +executor_memory_overhead=${cpu_name}_${dataset_name}"_executorMemOverhead" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_extra_java_options_val=${!executor_extra_java_options} +executor_memory_overhead_val=${!executor_memory_overhead} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_memory_overhead_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- lgbm-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/lightgbmlib.jar lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.LightGBMRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --master ${master} \ + --files=lib/lib_lightgbm_close.so \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --jars "lib/lightgbmlib.jar,lib/fastutil-8.3.1.jar,lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar,lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar" \ + --driver-class-path "lib/lightgbmlib.jar:lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:lib/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar:lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/lightgbmlib.jar:/opt/ml_classpath/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:/opt/ml_classpath/boostkit-lightgbm-kernel_${scala_version_val}-1.3.0.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.LightGBMRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --jars "lib/lightgbmlib.jar,lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar" \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val}" \ + --driver-class-path "lib/lightgbmlib.jar:lib/snakeyaml-1.19.jar:lib/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/lightgbmlib.jar:/opt/ml_classpath/mmlspark_${scala_version_val}_spark3.1.2-0.0.0+79-09152193.jar:/opt/ml_classpath/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/linR_run.sh b/tools/kal-test/bin/ml/linR_run.sh new file mode 100644 index 0000000..3a8182a --- /dev/null +++ b/tools/kal-test/bin/ml/linR_run.sh @@ -0,0 +1,146 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. mnist8m/Twitter/rcv" + echo "2nd argument: name of API: e.g. fit/fit1/fit2/fit3" + echo "3th argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/linR/linR_spark.properties +dataset_name=$1 +api_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" +max_failures="maxFailures" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +max_failures_val=${!max_failures} +compress_val=${!compress_} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${max_failures} : ${max_failures_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${max_failures_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- LinR-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.LinRRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.task.maxFailures=${max_failures_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.LinRRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.task.maxFailures=${max_failures_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/logR_run.sh b/tools/kal-test/bin/ml/logR_run.sh new file mode 100644 index 0000000..7ef64f0 --- /dev/null +++ b/tools/kal-test/bin/ml/logR_run.sh @@ -0,0 +1,125 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. mnist8m/Twitter/rcv" + echo "2nd argument: name of API: e.g. fit/fit1/fit2/fit3" + echo "3rd argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/logR/logR_spark.properties +dataset_name=$1 +api_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- LogR-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.LogRRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.LogRRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores 
${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/nmf_run.sh b/tools/kal-test/bin/ml/nmf_run.sh new file mode 100644 index 0000000..6fe7376 --- /dev/null +++ b/tools/kal-test/bin/ml/nmf_run.sh @@ -0,0 +1,128 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: CSJ, MT, Books, HibenchRating50mx10mx500m" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/nmf/nmf_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +executor_memory_overhead=${cpu_name}_${dataset_name}"_executorMemOverhead" +executor_extra_java_options=${cpu_name}_${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_memory_overhead_val=${!executor_memory_overhead} +executor_extra_java_options_val=${!executor_extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${executor_extra_java_options} : ${executor_extra_java_options_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_memory_overhead_val} ] \ + || [ ! ${executor_extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path=${!dataset_name} + +echo "${dataset_name} : ${data_path}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- nmf-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.NMFRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val} -Xss512m" \ + --driver-java-options "-Dlog4j.configuration=file:./log4j.properties" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} 
${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.NMFRunner \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master} \ + --conf "spark.executor.extraJavaOptions=${executor_extra_java_options_val} -Xss512m" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --jars "lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/pca_run.sh b/tools/kal-test/bin/ml/pca_run.sh new file mode 100644 index 0000000..76d6db3 --- /dev/null +++ b/tools/kal-test/bin/ml/pca_run.sh @@ -0,0 +1,136 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. D10M1K/D1M10K/MESH" + echo "3rd argument: name of API:[for dataframe: fit/fit1/fit2/fit3; for rdd: train]" + echo "4th argument: optimization algorithm or raw: [no/yes]" + echo "5th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 5 ]; then + usage + exit 0 +fi + +source conf/ml/pca/pca_spark.properties +data_structure=$1 +dataset_name=$2 +api_name=$3 +is_raw=$4 +if_check=$5 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${dataset_name}"_numExectuors_"${cpu_name} +executor_cores=${dataset_name}"_executorCores_"${cpu_name} +executor_memory=${dataset_name}"_executorMemory_"${cpu_name} +extra_java_options=${dataset_name}"_extraJavaOptions_"${cpu_name} +driver_cores=${dataset_name}"_driverCores_"${cpu_name} +driver_memory=${dataset_name}"_driverMemory_"${cpu_name} +executor_memory_overhead=${dataset_name}"_executorMemoryOverhead_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_memory_overhead_val=${!executor_memory_overhead} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_memory_overhead_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- PCA-${model_conf}" + +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.PCARunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log 
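+  # Note: this optimized (is_raw=no) branch relies on the scp commands above having staged the fastutil and boostkit jars under /opt/ml_classpath/ on agent1-3, which is where spark.executor.extraClassPath points.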
+else + spark-submit \ + --class com.bigdata.ml.PCARunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/pearson_run.sh b/tools/kal-test/bin/ml/pearson_run.sh new file mode 100644 index 0000000..c5a4ddc --- /dev/null +++ b/tools/kal-test/bin/ml/pearson_run.sh @@ -0,0 +1,144 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. CP10M1K/CP2M5K/CP1M10K" + echo "3nd argument: optimization algorithm or raw: [no/yes]" + echo "4rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/pearson/pearson_spark.properties +data_structure=$1 +dataset_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${data_structure}-${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +memory_overhead="execMemOverhead_"${cpu_name} +master_="master" +deploy_mode="deployMode" +dataset_output_=${dataset_name}"_output" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +memory_overhead_val=${!memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${memory_overhead} : ${memory_overhead_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${memory_overhead_val} ] \ + || [ ! ${master_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +data_path_output=${!dataset_output_} +hdfs dfs -rm -r -f "${data_path_output}_${cpu_name}_${is_raw}" +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +mkdir -p log + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- Pearson-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.PearsonRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.PearsonRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --jars "lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/ps_run.sh b/tools/kal-test/bin/ml/ps_run.sh new file mode 100644 index 0000000..85f4324 --- /dev/null +++ b/tools/kal-test/bin/ml/ps_run.sh @@ -0,0 +1,135 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. kosarak/IBM10M47/IBM100M47" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/ps/ps_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +mkdir -p log +echo "start to submit spark jobs --- PrefixSpan-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.PrefixSpanRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.rdd.compress=false" \ + --conf "spark.network.timeout=600s" \ + --conf "spark.driver.maxResultSize=256G" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} 
${save_resultPath_val}| tee ./log/log +else + scp lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.PrefixSpanRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.rdd.compress=false" \ + --conf "spark.network.timeout=600s" \ + --conf "spark.driver.maxResultSize=256G" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --jars "lib/fastutil-8.3.1.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/rf_run.sh b/tools/kal-test/bin/ml/rf_run.sh new file mode 100644 index 0000000..14da3c4 --- /dev/null +++ b/tools/kal-test/bin/ml/rf_run.sh @@ -0,0 +1,149 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of algorithm: [classification/regression]" + echo "2nd argument: type of data structure: [dataframe/rdd]" + echo "3rd argument: name of dataset: [epsilon/higgs/mnist8m/rcv]" + echo "4th argument: name of API: [for dataframe: fit/fit1/fit2/fit3; for rdd: train/train1/train2]" + echo "5th argument: optimization algorithm or raw: [no/yes]" + echo "6th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 6 ]; then + usage + exit 0 +fi + +source conf/ml/rf/rf_spark.properties +algorithm_type=$1 +data_structure=$2 +dataset_name=$3 +api_name=$4 +is_raw=$5 +if_check=$6 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${data_structure}-${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}_${algorithm_type}_${dataset_name}"_numExectuors" +executor_cores=${cpu_name}_${algorithm_type}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${algorithm_type}_${dataset_name}"_executorMemory" +extra_java_options=${cpu_name}_${algorithm_type}_${dataset_name}"_extraJavaOptions" +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" +max_failures="maxFailures" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +max_failures_val=${!max_failures} +compress_val=${!compress_} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${max_failures} : ${max_failures_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! 
${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${max_failures_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- rf-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.RFRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.task.maxFailures=${max_failures_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar"
\ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.RFRunner \ + --driver-java-options "-Xms15g" \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.task.maxFailures=${max_failures_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/simrank_run.sh b/tools/kal-test/bin/ml/simrank_run.sh new file mode 100644 index 0000000..38d4dbd --- /dev/null +++ b/tools/kal-test/bin/ml/simrank_run.sh @@ -0,0 +1,134 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. simrank3w" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/simrank/simrank_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatenate strings as a new variable +num_executors="numExectuors_"${cpu_name} +executor_cores="executorCores_"${cpu_name} +executor_memory="executorMemory_"${cpu_name} +extra_java_options="extraJavaOptions_"${cpu_name} +driver_cores="driverCores_"${cpu_name} +driver_memory="driverMemory_"${cpu_name} +executor_memory_overhead="execMemOverhead_"${cpu_name} +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +executor_memory_overhead_val=${!executor_memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ !
${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +mkdir -p log +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- SimRank-${model_conf}" +if [[ ${is_raw} == "no" ]]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SimRankRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} 
${save_resultPath_val}| tee ./log/log +else + scp lib/fastutil-8.3.1.jar lib/kal-test_${scala_version_val}-0.1.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/kal-test_${scala_version_val}-0.1.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/kal-test_${scala_version_val}-0.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SimRankRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/kal-test_${scala_version_val}-0.1.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/kal-test_${scala_version_val}-0.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/spca_run.sh b/tools/kal-test/bin/ml/spca_run.sh new file mode 100644 index 0000000..c849fd1 --- /dev/null +++ b/tools/kal-test/bin/ml/spca_run.sh @@ -0,0 +1,148 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. Kemelmacher/mesh_deform/wathen100/MOLIERE" + echo "2th argument: name of API: e.g. fit/fit1/fit2/fit3" + echo "3th argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/spca/spca_spark.properties +dataset_name=$1 +api_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores_"${dataset_name}_${cpu_name} +driver_memory="driverMemory_"${dataset_name}_${cpu_name} +memory_overhead="execMemOverhead_"${dataset_name}_${cpu_name} +master_="master" +deploy_mode="deployMode" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +memory_overhead_val=${!memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +compress_val=${!compress_} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${memory_overhead} : ${memory_overhead_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${memory_overhead_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +mkdir -p log +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- SPCA-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SPCARunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.SPCARunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --jars "lib/snakeyaml-1.19.jar,lib/boostkit-ml-kernel-client-${scala_version_val}-${kal_version_val}-${spark_version_val}.jar" \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memory_overhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/spearman_run.sh b/tools/kal-test/bin/ml/spearman_run.sh new file mode 100644 index 0000000..d23cc4b --- /dev/null +++ b/tools/kal-test/bin/ml/spearman_run.sh @@ -0,0 +1,145 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: type of data structure: [dataframe/rdd]" + echo "2nd argument: name of dataset: e.g. CP10M1K/CP2M5K/CP1M10K" + echo "3nd argument: optimization algorithm or raw: [no/yes]" + echo "4rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/spearman/spearman_spark.properties +data_structure=$1 +dataset_name=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +model_conf=${data_structure}-${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +executor_memory_overhead="executorMemOverhead_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores" +driver_memory="driverMemory" +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +executor_memory_overhead_val=${!executor_memory_overhead} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${executor_memory_overhead} : ${executor_memory_overhead_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${executor_memory_overhead_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +mkdir -p log +echo "start to submit spark jobs --- SpearMan-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SpearManRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256g" \ + --conf "spark.network.timeout=3600s" \ + --conf "spark.task.maxFailures=1" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + 
./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + scp lib/fastutil-8.3.1.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SpearManRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.memoryOverhead=${executor_memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256g" \ + --conf "spark.network.timeout=3600s" \ + --conf "spark.task.maxFailures=1" \ + --jars "lib/fastutil-8.3.1.jar" \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi + diff --git a/tools/kal-test/bin/ml/svd_run.sh b/tools/kal-test/bin/ml/svd_run.sh new file mode 100644 index 0000000..638a6a0 --- /dev/null +++ b/tools/kal-test/bin/ml/svd_run.sh @@ -0,0 +1,147 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. D10M1K/D1M10K/MESH/RUCCI" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/svd/svd_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores_"${dataset_name}_${cpu_name} +driver_memory="driverMemory_"${dataset_name}_${cpu_name} +memory_overhead="execMemOverhead_"${dataset_name}_${cpu_name} +master_="master" +deploy_mode="deployMode" +compress_="compress" + + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +memory_overhead_val=${!memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +compress_val=${!compress_} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${memory_overhead} : ${memory_overhead_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! 
${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${memory_overhead_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! ${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +mkdir -p log +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- SVD-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + + spark-submit \ + --class com.bigdata.ml.SVDRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf
"spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.SVDRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memory_overhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --driver-class-path "lib/snakeyaml-1.19.jar:lib/fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/svm_run.sh b/tools/kal-test/bin/ml/svm_run.sh new file mode 100644 index 0000000..7427a1d --- /dev/null +++ b/tools/kal-test/bin/ml/svm_run.sh @@ -0,0 +1,130 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. ECBDL14/epsilon/rcv" + echo "2nd argument: name of API: e.g. fit/fit1/fit2/fit3" + echo "3th argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/svm/svm_spark.properties +dataset_name=$1 +api_name=$2 +is_raw=$3 +if_check=$4 + +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +model_conf=${dataset_name}-${api_name}-${is_raw}-${if_check} + +# concatnate strings as a new variable +num_executors=${cpu_name}"_numExectuors" +executor_cores=${cpu_name}"_executorCores" +executor_memory=${cpu_name}"_executorMemory" +extra_java_options=${cpu_name}"_extraJavaOptions" +driver_cores=${cpu_name}"_driverCores" +driver_memory=${cpu_name}"_driverMemory" +master_=${cpu_name}"_master" +deploy_mode=${cpu_name}"_deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + +echo ${cpu_name} +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! 
${master_val} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- SVM-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.SVMRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.LinearSVC.inertiaCofficient=0.5" \ + --jars "lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.SVMRunner \ + --deploy-mode 
${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val} -XX:SurvivorRatio=4 -XX:ParallelGCThreads=6" \ + --jars "lib/fastutil-8.3.1.jar" \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=fastutil-8.3.1.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/te_run.sh b/tools/kal-test/bin/ml/te_run.sh new file mode 100644 index 0000000..a8f84a0 --- /dev/null +++ b/tools/kal-test/bin/ml/te_run.sh @@ -0,0 +1,132 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: e.g. movielens, taobao, criteo40m, criteo150m" + echo "2nd argument: optimization algorithm or raw: [no/yes]" + echo "3rd argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + + +source conf/ml/te/te_spark.properties +dataset_name=$1 +is_raw=$2 +if_check=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${dataset_name}-${is_raw}-${if_check} + +# concatenate strings as a new variable +num_executors=${cpu_name}_${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${dataset_name}"_executorMemory" +driver_cores=${cpu_name}_${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${dataset_name}"_driverMemory" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} + +echo "master : ${master}" +echo "deployMode : ${deployMode}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master} ] \ + || [ ! ${deployMode} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +echo "start to clean cache and sleep 3s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 3 + +echo "start to submit spark jobs --- TargetEncoder-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + mkdir -p log + spark-submit \ + --class com.bigdata.ml.TERunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.driver.memory=128g \ + --conf spark.locality.wait=0s \ + --conf spark.scheduler.minRegisteredResourcesRatio=1 \ + --conf spark.driver.maxResultSize=40g \ + --conf spark.network.timeout=60000s \ + --conf spark.rpc.askTimeout=60000s \ + --conf spark.executor.heartbeatInterval=600s \ + --conf spark.eventLog.enabled=false \ + --driver-class-path "lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --jars "lib/sparkling-water-assembly-extensions_2.12-3.38.0.1-1-3.1-all.jar,lib/sparkling-water-assembly_2.12-3.38.0.1-1-3.1-all.jar,lib/sparkling-water-assembly-scoring_2.12-3.38.0.1-1-3.1-all.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.TERunner \ + --master ${master} \ + --deploy-mode ${deployMode} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --conf spark.driver.memory=128g \ 
+ --conf spark.locality.wait=0s \ + --conf spark.scheduler.minRegisteredResourcesRatio=1 \ + --conf spark.driver.maxResultSize=40g \ + --conf spark.network.timeout=60000s \ + --conf spark.rpc.askTimeout=60000s \ + --conf spark.executor.heartbeatInterval=600s \ + --conf spark.eventLog.enabled=false \ + --driver-class-path "lib/snakeyaml-1.19.jar" \ + --jars "lib/sparkling-water-assembly-extensions_2.12-3.38.0.1-1-3.1-all.jar,lib/sparkling-water-assembly_2.12-3.38.0.1-1-3.1-all.jar,lib/sparkling-water-assembly-scoring_2.12-3.38.0.1-1-3.1-all.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${save_resultPath_val}| tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml/word2vec_run.sh b/tools/kal-test/bin/ml/word2vec_run.sh new file mode 100644 index 0000000..d3e7346 --- /dev/null +++ b/tools/kal-test/bin/ml/word2vec_run.sh @@ -0,0 +1,151 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1st argument: name of dataset: cate/node/item/taobao" + echo "2nd argument: name of API: fit/fit1/fit2/fit3" + echo "3rd argument: optimization algorithm or raw: no/yes" +} + +case "$1" in +-h | --help | ?) + usage + exit 0 + ;; +esac + +if [ $# -ne 3 ]; then + usage + exit 0 +fi + +source conf/ml/word2vec/word2vec_spark.properties +dataset_name=$1 +api_name=$2 +is_raw=$3 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +# concatenate strings as a new variable +num_executors="numExectuors_"${dataset_name}_${cpu_name} +executor_cores="executorCores_"${dataset_name}_${cpu_name} +executor_memory="executorMemory_"${dataset_name}_${cpu_name} +extra_java_options="extraJavaOptions_"${dataset_name}_${cpu_name} +driver_cores="driverCores" +driver_memory="driverMemory" +memory_overhead="execMemOverhead" +master_="master" +deploy_mode="deployMode" +compress_="compress" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +memory_overhead_val=${!memory_overhead} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} +compress_val=${!compress_} + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${memory_overhead} : ${memory_overhead_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${compress_} : ${compress_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${memory_overhead_val} ] \ + || [ ! ${compress_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} + +data_path=alibaba_${dataset_name} +data_train=alibaba_${dataset_name}_downstreamTrainFile +data_test=alibaba_${dataset_name}_downstreamTestFile +data_path_val=${!data_path} +data_train_val=${!data_train} +data_test_val=${!data_test} + +echo "${dataset_name} : ${data_path_val}" +echo "downstreamTrainFile : ${data_train_val}" +echo "downstreamTestFile : ${data_test_val}" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +model_conf=${dataset_name}-${api_name}-${scala_version_val} +echo "start to submit spark jobs --- word2vec-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/fastutil-8.3.1.jar lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.Word2VecRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memoryOverhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --conf "spark.eventLog.enabled=false" \ + --jars "lib/snakeyaml-1.19.jar,lib/fastutil-8.3.1.jar,lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar,lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --driver-class-path 
"lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar:lib/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/fastutil-8.3.1.jar:/opt/ml_classpath/boostkit-ml-acc_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-core_${scala_version_val}-${kal_version_val}-${spark_version_val}.jar:/opt/ml_classpath/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${data_path_val} ${data_train_val} ${data_test_val} ${model_conf} ${is_raw} ${spark_conf} | tee ./log/log +else + spark-submit \ + --class com.bigdata.ml.Word2VecRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf "spark.executor.extraJavaOptions=${extra_java_options_val}" \ + --conf "spark.executor.instances=${num_executors_val}" \ + --conf "spark.executor.memory_overhead=${memory_overhead_val}" \ + --conf "spark.driver.maxResultSize=256G" \ + --conf "spark.rdd.compress=${compress_val}" \ + --conf "spark.eventLog.enabled=false" \ + --jars "lib/snakeyaml-1.19.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/fastutil-8.3.1.jar:lib/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${data_path_val} ${data_train_val} ${data_test_val} ${model_conf} ${is_raw} ${spark_conf} | tee ./log/log +fi diff --git a/tools/kal-test/bin/ml/xgbt_run.sh b/tools/kal-test/bin/ml/xgbt_run.sh new file mode 100644 index 0000000..a2bbb11 --- /dev/null +++ b/tools/kal-test/bin/ml/xgbt_run.sh @@ -0,0 +1,153 @@ +#!/bin/bash +set -e + +function usage() { + echo "Usage: " + echo "1rd argument: name of dataset: e.g. higgs/mnist8m" + echo "2st argument: type of algorithm: [classification/regression]" + echo "3th argument: optimization algorithm or raw: [no/yes]" + echo "4th argument: Whether to Compare Results [no/yes]" +} + +case "$1" in +-h | --help | ?) 
+ usage + exit 0 + ;; +esac + +if [ $# -ne 4 ]; then + usage + exit 0 +fi + +source conf/ml/xgbt/xgbt_spark.properties +dataset_name=$1 +algorithm_type=$2 +is_raw=$3 +if_check=$4 +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') +model_conf=${algorithm_type}-${dataset_name}-${is_raw}-${if_check} + +# concatenate strings as a new variable +num_executors=${cpu_name}_${algorithm_type}"_"${dataset_name}"_numExecutors" +executor_cores=${cpu_name}_${algorithm_type}"_"${dataset_name}"_executorCores" +executor_memory=${cpu_name}_${algorithm_type}"_"${dataset_name}"_executorMemory" +extra_java_options=${cpu_name}_${algorithm_type}"_"${dataset_name}"_extraJavaOptions" +driver_cores=${cpu_name}_${algorithm_type}"_"${dataset_name}"_driverCores" +driver_memory=${cpu_name}_${algorithm_type}"_"${dataset_name}"_driverMemory" +task_cpus=${cpu_name}_${algorithm_type}"_"${dataset_name}"_taskCpus" +num_partitions=${cpu_name}_${algorithm_type}"_"${dataset_name}"_numPartitions" +master_="master" +deploy_mode="deployMode" + +num_executors_val=${!num_executors} +executor_cores_val=${!executor_cores} +executor_memory_val=${!executor_memory} +extra_java_options_val=${!extra_java_options} +driver_cores_val=${!driver_cores} +driver_memory_val=${!driver_memory} +task_cpus_val=${!task_cpus} +num_partitions_val=${!num_partitions} +master_val=${!master_} +deploy_mode_val=${!deploy_mode} + + +echo "${master_} : ${master_val}" +echo "${deploy_mode} : ${deploy_mode_val}" +echo "${driver_cores} : ${driver_cores_val}" +echo "${driver_memory} : ${driver_memory_val}" +echo "${num_executors} : ${num_executors_val}" +echo "${executor_cores}: ${executor_cores_val}" +echo "${executor_memory} : ${executor_memory_val}" +echo "${extra_java_options} : ${extra_java_options_val}" +echo "${task_cpus} : ${task_cpus_val}" +echo "${num_partitions} : ${num_partitions_val}" +echo "cpu_name : ${cpu_name}" + +if [ ! ${num_executors_val} ] \ + || [ ! ${executor_cores_val} ] \ + || [ ! ${executor_memory_val} ] \ + || [ ! ${extra_java_options_val} ] \ + || [ ! ${driver_cores_val} ] \ + || [ ! ${driver_memory_val} ] \ + || [ ! ${master_val} ] \ + || [ ! ${task_cpus_val} ] \ + || [ ! ${num_partitions_val} ] \ + || [ ! ${deploy_mode_val} ] \ + || [ ! 
${cpu_name} ]; then + echo "Some values are NULL, please confirm with the property files" + exit 0 +fi + + +source conf/ml/ml_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +save_resultPath=saveResultPath +save_resultPath_val=${!save_resultPath} +data_path_val=${!dataset_name} +echo "${dataset_name} : ${data_path_val}" + + +echo "start to submit spark jobs" + +spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val}_${extra_java_options_val}_${driver_cores_val}_${driver_memory_val}_${task_cpus_val}_${num_partitions_val} + +echo "start to clean cache and sleep 30s" +ssh server1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent1 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent2 "echo 3 > /proc/sys/vm/drop_caches" +ssh agent3 "echo 3 > /proc/sys/vm/drop_caches" +sleep 30 + +echo "start to submit spark jobs --- XGBT-${model_conf}" +if [ ${is_raw} == "no" ]; then + scp lib/snakeyaml-1.19.jar lib/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar root@agent1:/opt/ml_classpath/ + scp lib/snakeyaml-1.19.jar lib/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar root@agent2:/opt/ml_classpath/ + scp lib/snakeyaml-1.19.jar lib/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar lib/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.XGBTRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf spark.task.cpus=${task_cpus_val} \ + --conf spark.executor.extraJavaOptions=${extra_java_options_val} \ + --conf spark.executorEnv.LD_LIBRARY_PATH="./lib/:${LD_LIBRARY_PATH}" \ + --conf spark.executor.extraLibraryPath="./lib" \ + --conf spark.driver.extraLibraryPath="./lib" \ + --files=lib/libboostkit_xgboost_kernel.so \ + --jars "lib/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar,lib/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar,lib/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar,lib/snakeyaml-1.19.jar,lib/kal-test_${scala_version_val}-0.1.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:lib/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar:lib/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar:lib/snakeyaml-1.19.jar" \ + --conf 
"spark.executor.extraClassPath=/opt/ml_classpath/boostkit-xgboost4j-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar:/opt/ml_classpath/boostkit-xgboost4j_${scala_version_val}-${kal_version_val}-${cpu_name}.jar:/opt/ml_classpath/boostkit-xgboost4j-${spark_version_val}_${scala_version_val}-${kal_version_val}-${cpu_name}.jar:/opt/ml_classpath/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +else + scp lib/snakeyaml-1.19.jar lib/xgboost4j_${scala_version_val}-1.1.0.jar lib/xgboost4j-spark_${scala_version_val}-1.1.0.jar root@agent1:/opt/ml_classpath/ + scp lib/snakeyaml-1.19.jar lib/xgboost4j_${scala_version_val}-1.1.0.jar lib/xgboost4j-spark_${scala_version_val}-1.1.0.jar root@agent2:/opt/ml_classpath/ + scp lib/snakeyaml-1.19.jar lib/xgboost4j_${scala_version_val}-1.1.0.jar lib/xgboost4j-spark_${scala_version_val}-1.1.0.jar root@agent3:/opt/ml_classpath/ + + spark-submit \ + --class com.bigdata.ml.XGBTRunner \ + --deploy-mode ${deploy_mode_val} \ + --driver-cores ${driver_cores_val} \ + --driver-memory ${driver_memory_val} \ + --num-executors ${num_executors_val} \ + --executor-cores ${executor_cores_val} \ + --executor-memory ${executor_memory_val} \ + --master ${master_val} \ + --conf spark.task.cpus=${task_cpus_val} \ + --conf spark.executor.extraJavaOptions=${extra_java_options_val} \ + --jars "lib/xgboost4j_${scala_version_val}-1.1.0.jar,lib/xgboost4j-spark_${scala_version_val}-1.1.0.jar,lib/snakeyaml-1.19.jar,lib/kal-test_${scala_version_val}-0.1.jar" \ + --driver-class-path "lib/kal-test_${scala_version_val}-0.1.jar:lib/xgboost4j_${scala_version_val}-1.1.0.jar:lib/xgboost4j-spark_${scala_version_val}-1.1.0.jar:lib/snakeyaml-1.19.jar" \ + --conf "spark.executor.extraClassPath=/opt/ml_classpath/xgboost4j_${scala_version_val}-1.1.0.jar:/opt/ml_classpath/xgboost4j-spark_${scala_version_val}-1.1.0.jar:/opt/ml_classpath/snakeyaml-1.19.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar ${model_conf} ${data_path_val} ${cpu_name} ${spark_conf} ${save_resultPath_val} | tee ./log/log +fi \ No newline at end of file diff --git a/tools/kal-test/bin/ml_workflow.sh b/tools/kal-test/bin/ml_workflow.sh new file mode 100644 index 0000000..a5282a7 --- /dev/null +++ b/tools/kal-test/bin/ml_workflow.sh @@ -0,0 +1,414 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: optimization algorithm or raw: no/yes" + echo "2nd argument: Whether to Compare Results: no/yes" + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + echo "please input 1 arguments: " + echo "1st argument: optimization algorithm or raw: no/yes" + echo "2nd argument: Whether to Compare Results: no/yes" + exit 0 +fi + +is_raw=$1 +if_check=$2 +type=arm +if [ $is_raw == "yes" ]; then + type=raw +fi + +function createDir() { + dir=$1 + if [ ! 
-d $dir ]; then + mkdir $dir + fi +} +createDir logs +createDir log + +ml_classpath=/opt/ml_classpath/ +function ssh_mkdir() { + server=$1 + dir=$2 + ssh $server "mkdir -p $dir" +} +ssh_mkdir agent1 $ml_classpath +ssh_mkdir agent2 $ml_classpath +ssh_mkdir agent3 $ml_classpath + +# ALS +bash bin/ml/als_run.sh dataframe als fit ${is_raw} ${if_check} 2>&1 | tee -a logs/als_als_fit_${type}.log +bash bin/ml/als_run.sh dataframe als fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_als_fit1_${type}.log +bash bin/ml/als_run.sh dataframe als fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_als_fit2_${type}.log +bash bin/ml/als_run.sh dataframe als fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_als_fit3_${type}.log + +bash bin/ml/als_run.sh dataframe alsbs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsbs_fit_${type}.log +bash bin/ml/als_run.sh dataframe alsbs fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsbs_fit1_${type}.log +bash bin/ml/als_run.sh dataframe alsbs fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsbs_fit2_${type}.log +bash bin/ml/als_run.sh dataframe alsbs fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsbs_fit3_${type}.log + +bash bin/ml/als_run.sh dataframe alsh fit ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsh_fit_${type}.log +bash bin/ml/als_run.sh dataframe alsh fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsh_fit1_${type}.log +bash bin/ml/als_run.sh dataframe alsh fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsh_fit2_${type}.log +bash bin/ml/als_run.sh dataframe alsh fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/als_alsh_fit3_${type}.log + +#BO +bash bin/ml/bo_run.sh BostonHousing ${is_raw} ${if_check} 2>&1 | tee -a logs/bo_BostonHousing_${type}.log +bash bin/ml/bo_run.sh TitanicRf ${is_raw} ${if_check} 2>&1 | tee -a logs/bo_TitanicRf_${type}.log +bash bin/ml/bo_run.sh TitanicGBT ${is_raw} ${if_check} 2>&1 | tee -a logs/bo_TitanicGBT_${type}.log + +# Cov +bash bin/ml/cov_run.sh CP10M1K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP10M1K_${type}.log +bash bin/ml/cov_run.sh CP2M5K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP2M5K_${type}.log +bash bin/ml/cov_run.sh CP1M10K ${is_raw} ${if_check} 2>&1 | tee -a logs/cov_CP1M10K_${type}.log + +#DBSCAN +bash bin/ml/dbscan_run.sh bremenSmall ${is_raw} 2>&1 | tee -a logs/dbscan_bremenSmall_${type}.log +bash bin/ml/dbscan_run.sh farm ${is_raw} 2>&1 | tee -a logs/dbscan_farm_${type}.log +bash bin/ml/dbscan_run.sh house ${is_raw} 2>&1 | tee -a logs/dbscan_house_${type}.log + +# dtb +bash bin/ml/dtb_run.sh higgs fit verify ${is_raw} 2>&1 | tee -a logs/dtb_higgs_fit_${type}.log +bash bin/ml/dtb_run.sh higgs fit1 verify ${is_raw} 2>&1 | tee -a logs/dtb_higgs_fit1_${type}.log +bash bin/ml/dtb_run.sh higgs fit2 verify ${is_raw} 2>&1 | tee -a logs/dtb_higgs_fit2_${type}.log +bash bin/ml/dtb_run.sh higgs fit3 verify ${is_raw} 2>&1 | tee -a logs/dtb_higgs_fit3_${type}.log + +bash bin/ml/dtb_run.sh mnist8m fit verify ${is_raw} 2>&1 | tee -a logs/dtb_mnist8m_fit_${type}.log +bash bin/ml/dtb_run.sh mnist8m fit1 verify ${is_raw} 2>&1 | tee -a logs/dtb_mnist8m_fit1_${type}.log +bash bin/ml/dtb_run.sh mnist8m fit2 verify ${is_raw} 2>&1 | tee -a logs/dtb_mnist8m_fit2_${type}.log +bash bin/ml/dtb_run.sh mnist8m fit3 verify ${is_raw} 2>&1 | tee -a logs/dtb_mnist8m_fit3_${type}.log + +# DT +bash bin/ml/dt_run.sh classification dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_epsilon_fit_${type}.log +bash bin/ml/dt_run.sh classification dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a 
logs/dtc_epsilon_fit1_${type}.log +bash bin/ml/dt_run.sh classification dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_epsilon_fit2_${type}.log +bash bin/ml/dt_run.sh classification dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_epsilon_fit3_${type}.log + +bash bin/ml/dt_run.sh classification dataframe higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_higgs_fit_${type}.log +bash bin/ml/dt_run.sh classification dataframe higgs fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_higgs_fit1_${type}.log +bash bin/ml/dt_run.sh classification dataframe higgs fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_higgs_fit2_${type}.log +bash bin/ml/dt_run.sh classification dataframe higgs fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_higgs_fit3_${type}.log + +bash bin/ml/dt_run.sh classification dataframe mnist8m fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_mnist8m_fit_${type}.log +bash bin/ml/dt_run.sh classification dataframe mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_mnist8m_fit1_${type}.log +bash bin/ml/dt_run.sh classification dataframe mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_mnist8m_fit2_${type}.log +bash bin/ml/dt_run.sh classification dataframe mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtc_mnist8m_fit3_${type}.log + +bash bin/ml/dt_run.sh regression dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_epsilon_fit_${type}.log +bash bin/ml/dt_run.sh regression dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_epsilon_fit1_${type}.log +bash bin/ml/dt_run.sh regression dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_epsilon_fit2_${type}.log +bash bin/ml/dt_run.sh regression dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_epsilon_fit3_${type}.log + +bash bin/ml/dt_run.sh regression dataframe higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_higgs_fit_${type}.log +bash bin/ml/dt_run.sh regression dataframe higgs fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_higgs_fit1_${type}.log +bash bin/ml/dt_run.sh regression dataframe higgs fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_higgs_fit2_${type}.log +bash bin/ml/dt_run.sh regression dataframe higgs fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_higgs_fit3_${type}.log + +bash bin/ml/dt_run.sh regression dataframe mnist8m fit ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_mnist8m_fit_${type}.log +bash bin/ml/dt_run.sh regression dataframe mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_mnist8m_fit1_${type}.log +bash bin/ml/dt_run.sh regression dataframe mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_mnist8m_fit2_${type}.log +bash bin/ml/dt_run.sh regression dataframe mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/dtr_mnist8m_fit3_${type}.log + +# encoder +bash bin/ml/encoder_run.sh encoder_400m ${is_raw} ${if_check} 2>&1 | tee -a logs/encoder_encoder_400m_${type}.log +bash bin/ml/encoder_run.sh encoder_800m ${is_raw} ${if_check} 2>&1 | tee -a logs/encoder_encoder_800m_${type}.log + +#fm +bash bin/ml/fm_run.sh classification epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_epsilon_${type}.log +bash bin/ml/fm_run.sh regression epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_epsilon_${type}.log +bash bin/ml/fm_run.sh classification higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_higgs_${type}.log +bash bin/ml/fm_run.sh regression higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_higgs_${type}.log +bash bin/ml/fm_run.sh 
classification avazu fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_avazu_${type}.log +bash bin/ml/fm_run.sh regression avazu fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_avazu_${type}.log +bash bin/ml/fm_run.sh classification kdda fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmc_kdda_${type}.log +bash bin/ml/fm_run.sh regression kdda fit ${is_raw} ${if_check} 2>&1 | tee -a logs/fmr_kdda_${type}.log + +#fpg +bash bin/ml/fpg_run.sh Kosarak ${is_raw} ${if_check} 2>&1 | tee -a logs/fpg_Kosarak_${type}.log +bash bin/ml/fpg_run.sh Kosarak25 ${is_raw} ${if_check} 2>&1 | tee -a logs/fpg_Kosarak25_${type}.log +bash bin/ml/fpg_run.sh IBM700 ${is_raw} ${if_check} 2>&1 | tee -a logs/fpg_IBM700_${type}.log + +# gbdt +bash bin/ml/gbdt_run.sh classification dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_epsilon_fit_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_epsilon_fit1_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_epsilon_fit2_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_epsilon_fit3_${type}.log + +bash bin/ml/gbdt_run.sh classification dataframe rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_rcv_fit_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_rcv_fit1_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_rcv_fit2_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_rcv_fit3_${type}.log + +bash bin/ml/gbdt_run.sh classification dataframe D10M4096libsvm fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_D10M4096libsvm_fit_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe D10M4096libsvm fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_D10M4096libsvm_fit1_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe D10M4096libsvm fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_D10M4096libsvm_fit2_${type}.log +bash bin/ml/gbdt_run.sh classification dataframe D10M4096libsvm fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtc_D10M4096libsvm_fit3_${type}.log + +bash bin/ml/gbdt_run.sh regression dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_epsilon_fit_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_epsilon_fit1_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_epsilon_fit2_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_epsilon_fit3_${type}.log + +bash bin/ml/gbdt_run.sh regression dataframe rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_rcv_fit_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_rcv_fit1_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_rcv_fit2_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_rcv_fit3_${type}.log + +bash bin/ml/gbdt_run.sh regression dataframe D10M4096libsvm fit ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_D10M4096libsvm_fit_${type}.log +bash bin/ml/gbdt_run.sh 
regression dataframe D10M4096libsvm fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_D10M4096libsvm_fit1_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe D10M4096libsvm fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_D10M4096libsvm_fit2_${type}.log +bash bin/ml/gbdt_run.sh regression dataframe D10M4096libsvm fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/gbdtr_D10M4096libsvm_fit3_${type}.log + +#hdb +bash bin/ml/hdb_run.sh Hibench1m_100 ${is_raw} ${if_check} 2>&1 | tee -a logs/hdb_Hibench1m_100_${type}.log +bash bin/ml/hdb_run.sh Hibench1m_200 ${is_raw} ${if_check} 2>&1 | tee -a logs/hdb_Hibench1m_200_${type}.log + +# IDF +bash bin/ml/idf_run.sh D10m200m ${is_raw} ${if_check} 2>&1 | tee -a logs/idf_D10m200m_${type}.log +bash bin/ml/idf_run.sh D2g250m ${is_raw} ${if_check} 2>&1 | tee -a logs/idf_D2g250m_${type}.log + +#if +bash bin/ml/if_run.sh if_40M_1k ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_40M_1k_${type}.log +bash bin/ml/if_run.sh if_1M_1k ${is_raw} ${if_check} 2>&1 | tee -a logs/if_if_1M_1k_${type}.log + +# KMEANS +bash bin/ml/kmeans_run.sh dataframe D1200M20 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D1200M20_fit_${type}.log +bash bin/ml/kmeans_run.sh dataframe D1200M20 fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D1200M20_fit1_${type}.log +bash bin/ml/kmeans_run.sh dataframe D1200M20 fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D1200M20_fit2_${type}.log +bash bin/ml/kmeans_run.sh dataframe D1200M20 fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D1200M20_fit3_${type}.log + +bash bin/ml/kmeans_run.sh dataframe D200M20 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M20_fit_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M20 fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M20_fit1_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M20 fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M20_fit2_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M20 fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M20_fit3_${type}.log + +bash bin/ml/kmeans_run.sh dataframe D200M100 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M100_fit_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M100 fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M100_fit1_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M100 fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M100_fit2_${type}.log +bash bin/ml/kmeans_run.sh dataframe D200M100 fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/kmeans_D200M100_fit3_${type}.log + +#knn +bash bin/ml/knn_run.sh glove ${is_raw} 2>&1 | tee -a logs/knn_glove_${type}.log +bash bin/ml/knn_run.sh gist ${is_raw} 2>&1 | tee -a logs/knn_gist_${type}.log +bash bin/ml/knn_run.sh deep1b ${is_raw} 2>&1 | tee -a logs/knn_deep1b_${type}.log + +#lda +bash bin/ml/lda_run.sh dataframe nytimes fit ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_nytimes_fit_${type}.log +bash bin/ml/lda_run.sh dataframe nytimes fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_nytimes_fit1_${type}.log +bash bin/ml/lda_run.sh dataframe nytimes fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_nytimes_fit2_${type}.log +bash bin/ml/lda_run.sh dataframe nytimes fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_nytimes_fit3_${type}.log + +bash bin/ml/lda_run.sh dataframe pubmed fit ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_pubmed_fit_${type}.log +bash bin/ml/lda_run.sh dataframe pubmed fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_pubmed_fit1_${type}.log +bash bin/ml/lda_run.sh dataframe pubmed fit2 
${is_raw} ${if_check} 2>&1 | tee -a logs/lda_pubmed_fit2_${type}.log +bash bin/ml/lda_run.sh dataframe pubmed fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_pubmed_fit3_${type}.log + +bash bin/ml/lda_run.sh dataframe D20M200K fit ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_D20M200K_fit_${type}.log +bash bin/ml/lda_run.sh dataframe D20M200K fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_D20M200K_fit1_${type}.log +bash bin/ml/lda_run.sh dataframe D20M200K fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_D20M200K_fit2_${type}.log +bash bin/ml/lda_run.sh dataframe D20M200K fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/lda_D20M200K_fit3_${type}.log + +#lgbm +bash bin/ml/lgbm_run.sh regression mnist8m ${is_raw} ${if_check} 2>&1 | tee -a logs/lgbmr_mnist8m_${type}.log +bash bin/ml/lgbm_run.sh regression higgs ${is_raw} ${if_check} 2>&1 | tee -a logs/lgbmr_higgs_${type}.log +bash bin/ml/lgbm_run.sh classification mnist8m ${is_raw} ${if_check} 2>&1 | tee -a logs/lgbmc_mnist8m_${type}.log +bash bin/ml/lgbm_run.sh classification higgs ${is_raw} ${if_check} 2>&1 | tee -a logs/lgbmc_higgs_${type}.log + +#linR +bash bin/ml/linR_run.sh mnist8m fit ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_mnist8m_fit_${type}.log +bash bin/ml/linR_run.sh mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_mnist8m_fit1_${type}.log +bash bin/ml/linR_run.sh mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_mnist8m_fit2_${type}.log +bash bin/ml/linR_run.sh mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_mnist8m_fit3_${type}.log + +bash bin/ml/linR_run.sh Twitter fit ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_Twitter_fit_${type}.log +bash bin/ml/linR_run.sh Twitter fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_Twitter_fit1_${type}.log +bash bin/ml/linR_run.sh Twitter fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_Twitter_fit2_${type}.log +bash bin/ml/linR_run.sh Twitter fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_Twitter_fit3_${type}.log + +bash bin/ml/linR_run.sh rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_rcv_fit_${type}.log +bash bin/ml/linR_run.sh rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_rcv_fit1_${type}.log +bash bin/ml/linR_run.sh rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_rcv_fit2_${type}.log +bash bin/ml/linR_run.sh rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/linR_rcv_fit3_${type}.log + +#logR +bash bin/ml/logR_run.sh mnist8m fit ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_mnist8m_fit_${type}.log +bash bin/ml/logR_run.sh mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_mnist8m_fit1_${type}.log +bash bin/ml/logR_run.sh mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_mnist8m_fit2_${type}.log +bash bin/ml/logR_run.sh mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_mnist8m_fit3_${type}.log + +bash bin/ml/logR_run.sh Twitter fit ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_Twitter_fit_${type}.log +bash bin/ml/logR_run.sh Twitter fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_Twitter_fit1_${type}.log +bash bin/ml/logR_run.sh Twitter fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_Twitter_fit2_${type}.log +bash bin/ml/logR_run.sh Twitter fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_Twitter_fit3_${type}.log + +bash bin/ml/logR_run.sh rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_rcv_fit_${type}.log +bash bin/ml/logR_run.sh rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_rcv_fit1_${type}.log +bash bin/ml/logR_run.sh rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a 
logs/logR_rcv_fit2_${type}.log +bash bin/ml/logR_run.sh rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/logR_rcv_fit3_${type}.log + +#nmf +bash bin/ml/nmf_run.sh CSJ ${is_raw} ${if_check} 2>&1 | tee -a logs/nmf_CSJ_${type}.log +bash bin/ml/nmf_run.sh MT ${is_raw} ${if_check} 2>&1 | tee -a logs/nmf_MT_${type}.log +bash bin/ml/nmf_run.sh Books ${is_raw} ${if_check} 2>&1 | tee -a logs/nmf_Books_${type}.log + +#pca +bash bin/ml/pca_run.sh dataframe D10M1K fit ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D10M1K_fit_${type}.log +bash bin/ml/pca_run.sh dataframe D10M1K fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D10M1K_fit1_${type}.log +bash bin/ml/pca_run.sh dataframe D10M1K fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D10M1K_fit2_${type}.log +bash bin/ml/pca_run.sh dataframe D10M1K fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D10M1K_fit3_${type}.log + +bash bin/ml/pca_run.sh dataframe D1M10K fit ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D1M10K_fit_${type}.log +bash bin/ml/pca_run.sh dataframe D1M10K fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D1M10K_fit1_${type}.log +bash bin/ml/pca_run.sh dataframe D1M10K fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D1M10K_fit2_${type}.log +bash bin/ml/pca_run.sh dataframe D1M10K fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_D1M10K_fit3_${type}.log + +bash bin/ml/pca_run.sh dataframe MESH fit ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_MESH_fit_${type}.log +bash bin/ml/pca_run.sh dataframe MESH fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_MESH_fit1_${type}.log +bash bin/ml/pca_run.sh dataframe MESH fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_MESH_fit2_${type}.log +bash bin/ml/pca_run.sh dataframe MESH fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/pca_MESH_fit3_${type}.log + +#pearson +bash bin/ml/pearson_run.sh dataframe CP10M1K ${is_raw} ${if_check} 2>&1 | tee -a logs/pearson_CP10M1K_${type}.log +bash bin/ml/pearson_run.sh dataframe CP2M5K ${is_raw} ${if_check} 2>&1 | tee -a logs/pearson_CP2M5K_${type}.log +bash bin/ml/pearson_run.sh dataframe CP1M10K ${is_raw} ${if_check} 2>&1 | tee -a logs/pearson_CP1M10K_${type}.log + +#ps +bash bin/ml/ps_run.sh kosarak ${is_raw} ${if_check} 2>&1 | tee -a logs/ps_kosarak_${type}.log +bash bin/ml/ps_run.sh IBM10M47 ${is_raw} ${if_check} 2>&1 | tee -a logs/ps_IBM10M47_${type}.log +bash bin/ml/ps_run.sh IBM100M47 ${is_raw} ${if_check} 2>&1 | tee -a logs/ps_IBM100M47_${type}.log + +# RF +bash bin/ml/rf_run.sh classification dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_epsilon_fit_${type}.log +bash bin/ml/rf_run.sh classification dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_epsilon_fit1_${type}.log +bash bin/ml/rf_run.sh classification dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_epsilon_fit2_${type}.log +bash bin/ml/rf_run.sh classification dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_epsilon_fit3_${type}.log + +bash bin/ml/rf_run.sh classification dataframe higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_higgs_fit_${type}.log +bash bin/ml/rf_run.sh classification dataframe higgs fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_higgs_fit1_${type}.log +bash bin/ml/rf_run.sh classification dataframe higgs fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_higgs_fit2_${type}.log +bash bin/ml/rf_run.sh classification dataframe higgs fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_higgs_fit3_${type}.log + +bash bin/ml/rf_run.sh classification dataframe mnist8m fit ${is_raw} ${if_check} 
2>&1 | tee -a logs/rfc_mnist8m_fit_${type}.log +bash bin/ml/rf_run.sh classification dataframe mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_mnist8m_fit1_${type}.log +bash bin/ml/rf_run.sh classification dataframe mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_mnist8m_fit2_${type}.log +bash bin/ml/rf_run.sh classification dataframe mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_mnist8m_fit3_${type}.log + +bash bin/ml/rf_run.sh classification dataframe rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_rcv_fit_${type}.log +bash bin/ml/rf_run.sh classification dataframe rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_rcv_fit1_${type}.log +bash bin/ml/rf_run.sh classification dataframe rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_rcv_fit2_${type}.log +bash bin/ml/rf_run.sh classification dataframe rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfc_rcv_fit3_${type}.log + +bash bin/ml/rf_run.sh regression dataframe epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_epsilon_fit_${type}.log +bash bin/ml/rf_run.sh regression dataframe epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_epsilon_fit1_${type}.log +bash bin/ml/rf_run.sh regression dataframe epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_epsilon_fit2_${type}.log +bash bin/ml/rf_run.sh regression dataframe epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_epsilon_fit3_${type}.log + +bash bin/ml/rf_run.sh regression dataframe higgs fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_higgs_fit_${type}.log +bash bin/ml/rf_run.sh regression dataframe higgs fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_higgs_fit1_${type}.log +bash bin/ml/rf_run.sh regression dataframe higgs fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_higgs_fit2_${type}.log +bash bin/ml/rf_run.sh regression dataframe higgs fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_higgs_fit3_${type}.log + +bash bin/ml/rf_run.sh regression dataframe mnist8m fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_mnist8m_fit_${type}.log +bash bin/ml/rf_run.sh regression dataframe mnist8m fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_mnist8m_fit1_${type}.log +bash bin/ml/rf_run.sh regression dataframe mnist8m fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_mnist8m_fit2_${type}.log +bash bin/ml/rf_run.sh regression dataframe mnist8m fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_mnist8m_fit3_${type}.log + +bash bin/ml/rf_run.sh regression dataframe rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_rcv_fit_${type}.log +bash bin/ml/rf_run.sh regression dataframe rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_rcv_fit1_${type}.log +bash bin/ml/rf_run.sh regression dataframe rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_rcv_fit2_${type}.log +bash bin/ml/rf_run.sh regression dataframe rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/rfr_rcv_fit3_${type}.log + +#simrank +bash bin/ml/simrank_run.sh simrank3w ${is_raw} ${if_check} 2>&1 | tee -a logs/simrank_simrank3w_${type}.log +bash bin/ml/simrank_run.sh simrank5w ${is_raw} ${if_check} 2>&1 | tee -a logs/simrank_simrank5w_${type}.log +bash bin/ml/simrank_run.sh simrank7w ${is_raw} ${if_check} 2>&1 | tee -a logs/simrank_simrank7w_${type}.log + +#spca +bash bin/ml/spca_run.sh Kemelmacher fit ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_Kemelmacher_fit_${type}.log +bash bin/ml/spca_run.sh Kemelmacher fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_Kemelmacher_fit1_${type}.log +bash bin/ml/spca_run.sh Kemelmacher fit2 ${is_raw} 
${if_check} 2>&1 | tee -a logs/spca_Kemelmacher_fit2_${type}.log +bash bin/ml/spca_run.sh Kemelmacher fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_Kemelmacher_fit3_${type}.log + +bash bin/ml/spca_run.sh mesh_deform fit ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_mesh_deform_fit_${type}.log +bash bin/ml/spca_run.sh mesh_deform fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_mesh_deform_fit1_${type}.log +bash bin/ml/spca_run.sh mesh_deform fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_mesh_deform_fit2_${type}.log +bash bin/ml/spca_run.sh mesh_deform fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_mesh_deform_fit3_${type}.log + +bash bin/ml/spca_run.sh wathen100 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_wathen100_fit_${type}.log +bash bin/ml/spca_run.sh wathen100 fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_wathen100_fit1_${type}.log +bash bin/ml/spca_run.sh wathen100 fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_wathen100_fit2_${type}.log +bash bin/ml/spca_run.sh wathen100 fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_wathen100_fit3_${type}.log + +bash bin/ml/spca_run.sh MOLIERE fit ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_MOLIERE_fit_${type}.log +bash bin/ml/spca_run.sh MOLIERE fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_MOLIERE_fit1_${type}.log +bash bin/ml/spca_run.sh MOLIERE fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_MOLIERE_fit2_${type}.log +bash bin/ml/spca_run.sh MOLIERE fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/spca_MOLIERE_fit3_${type}.log + +#spearman +bash bin/ml/spearman_run.sh dataframe CP10M1K ${is_raw} ${if_check} 2>&1 | tee -a logs/spearman_CP10M1K_${type}.log +bash bin/ml/spearman_run.sh dataframe CP2M5K ${is_raw} ${if_check} 2>&1 | tee -a logs/spearman_CP2M5K_${type}.log +bash bin/ml/spearman_run.sh dataframe CP1M10K ${is_raw} ${if_check} 2>&1 | tee -a logs/spearman_CP1M10K_${type}.log + +#svd +bash bin/ml/svd_run.sh D10M1K ${is_raw} ${if_check} 2>&1 | tee -a logs/svd_D10M1K_${type}.log +bash bin/ml/svd_run.sh D1M10K ${is_raw} ${if_check} 2>&1 | tee -a logs/svd_D1M10K_${type}.log +bash bin/ml/svd_run.sh MESH ${is_raw} ${if_check} 2>&1 | tee -a logs/svd_MESH_${type}.log +bash bin/ml/svd_run.sh RUCCI ${is_raw} ${if_check} 2>&1 | tee -a logs/svd_RUCCI_${type}.log + + +#svm +bash bin/ml/svm_run.sh ECBDL14 fit ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_ECBDL14_fit_${type}.log +bash bin/ml/svm_run.sh ECBDL14 fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_ECBDL14_fit1_${type}.log +bash bin/ml/svm_run.sh ECBDL14 fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_ECBDL14_fit2_${type}.log +bash bin/ml/svm_run.sh ECBDL14 fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_ECBDL14_fit3_${type}.log + +bash bin/ml/svm_run.sh epsilon fit ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_epsilon_fit_${type}.log +bash bin/ml/svm_run.sh epsilon fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_epsilon_fit1_${type}.log +bash bin/ml/svm_run.sh epsilon fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_epsilon_fit2_${type}.log +bash bin/ml/svm_run.sh epsilon fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_epsilon_fit3_${type}.log + +bash bin/ml/svm_run.sh rcv fit ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_rcv_fit_${type}.log +bash bin/ml/svm_run.sh rcv fit1 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_rcv_fit1_${type}.log +bash bin/ml/svm_run.sh rcv fit2 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_rcv_fit2_${type}.log +bash bin/ml/svm_run.sh rcv fit3 ${is_raw} ${if_check} 2>&1 | tee -a logs/svm_rcv_fit3_${type}.log + +# te 
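+# te_run.sh expects <dataset> <is_raw> <if_check>; the four datasets below (movielens, taobao, criteo40m, criteo150m) mirror its usage list.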
+bash bin/ml/te_run.sh movielens ${is_raw} ${if_check} 2>&1 | tee -a logs/te_movielens_${type}.log +bash bin/ml/te_run.sh taobao ${is_raw} ${if_check} 2>&1 | tee -a logs/te_taobao_${type}.log +bash bin/ml/te_run.sh criteo40m ${is_raw} ${if_check} 2>&1 | tee -a logs/te_criteo40m_${type}.log +bash bin/ml/te_run.sh criteo150m ${is_raw} ${if_check} 2>&1 | tee -a logs/te_criteo150m_${type}.log + +# word2vec +bash bin/ml/word2vec_run.sh cate fit ${is_raw} 2>&1 | tee -a logs/word2vec_cate_fit_${type}.log +bash bin/ml/word2vec_run.sh cate fit1 ${is_raw} 2>&1 | tee -a logs/word2vec_cate_fit1_${type}.log +bash bin/ml/word2vec_run.sh cate fit2 ${is_raw} 2>&1 | tee -a logs/word2vec_cate_fit2_${type}.log +bash bin/ml/word2vec_run.sh cate fit3 ${is_raw} 2>&1 | tee -a logs/word2vec_cate_fit3_${type}.log + +bash bin/ml/word2vec_run.sh item fit ${is_raw} 2>&1 | tee -a logs/word2vec_item_fit_${type}.log +bash bin/ml/word2vec_run.sh item fit1 ${is_raw} 2>&1 | tee -a logs/word2vec_item_fit1_${type}.log +bash bin/ml/word2vec_run.sh item fit2 ${is_raw} 2>&1 | tee -a logs/word2vec_item_fit2_${type}.log +bash bin/ml/word2vec_run.sh item fit3 ${is_raw} 2>&1 | tee -a logs/word2vec_item_fit3_${type}.log + +bash bin/ml/word2vec_run.sh node fit ${is_raw} 2>&1 | tee -a logs/word2vec_node_fit_${type}.log +bash bin/ml/word2vec_run.sh node fit1 ${is_raw} 2>&1 | tee -a logs/word2vec_node_fit1_${type}.log +bash bin/ml/word2vec_run.sh node fit2 ${is_raw} 2>&1 | tee -a logs/word2vec_node_fit2_${type}.log +bash bin/ml/word2vec_run.sh node fit3 ${is_raw} 2>&1 | tee -a logs/word2vec_node_fit3_${type}.log + +bash bin/ml/word2vec_run.sh taobao fit ${is_raw} 2>&1 | tee -a logs/word2vec_taobao_fit_${type}.log +bash bin/ml/word2vec_run.sh taobao fit1 ${is_raw} 2>&1 | tee -a logs/word2vec_taobao_fit1_${type}.log +bash bin/ml/word2vec_run.sh taobao fit2 ${is_raw} 2>&1 | tee -a logs/word2vec_taobao_fit2_${type}.log +bash bin/ml/word2vec_run.sh taobao fit3 ${is_raw} 2>&1 | tee -a logs/word2vec_taobao_fit3_${type}.log + +#xgbt +bash bin/ml/xgbt_run.sh higgs classification ${is_raw} ${if_check} 2>&1 | tee -a logs/xgbtc_higgs_${type}.log +bash bin/ml/xgbt_run.sh higgs regression ${is_raw} ${if_check} 2>&1 | tee -a logs/xgbtr_higgs_${type}.log + +bash bin/ml/xgbt_run.sh mnist8m classification ${is_raw} ${if_check} 2>&1 | tee -a logs/xgbtc_mnist8m_${type}.log +bash bin/ml/xgbt_run.sh mnist8m regression ${is_raw} ${if_check} 2>&1 | tee -a logs/xgbtr_mnist8m_${type}.log \ No newline at end of file diff --git a/tools/kal-test/bin/preprocess/graph/incpr_data_process.sh b/tools/kal-test/bin/preprocess/graph/incpr_data_process.sh new file mode 100644 index 0000000..efcaba4 --- /dev/null +++ b/tools/kal-test/bin/preprocess/graph/incpr_data_process.sh @@ -0,0 +1,59 @@ +#!/bin/bash +set -e + +function alg_usage() { + echo "Usage: " + echo "1st argument: name of input dataset: twitter" + echo "2nd argument: name of output dataset: twitter_2010" +} + +case "$1" in +-h | --help | ?) 
+ alg_usage + exit 0 + ;; +esac + +if [ $# -gt 2 ]; then + alg_usage + exit 0 +fi + +input=${1:-twitter} +output=${2:-twitter_2010} + +source conf/graph/graph_datasets.properties +spark_version=sparkVersion +spark_version_val=${!spark_version} +kal_version=kalVersion +kal_version_val=${!kal_version} +scala_version=scalaVersion +scala_version_val=${!scala_version} +cpu_name=$(lscpu | grep Architecture | awk '{print $2}') + +inputPath=${!input} +outPath=${!output} +split="," +seed=1 +iterNum=100 +resetProb=0.15 +partition=273 + +for rate in "0.001" "0.01" "0.05" +do + echo ">>> start twitter-2010-$rate" + output_incData=${outPath}_${rate} + hdfs dfs -rm -r -f ${output_incData}* + spark-submit \ + --class com.bigdata.preprocess.graph.IncDataGeneratorBatch \ + --master yarn \ + --num-executors 39 \ + --executor-memory 23g \ + --executor-cores 7 \ + --driver-memory 80g \ + --conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ + --conf spark.kryoserializer.buffer=2040m \ + --jars "lib/boostkit-graph-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar" \ + ./lib/kal-test_${scala_version_val}-0.1.jar yarn ${inputPath} ${split} ${output_incData} $rate $partition $seed $iterNum $resetProb 5 | tee ./log/log + echo ">>> end twitter-2010-$rate" +done \ No newline at end of file diff --git a/tools/kal-test/bin/preprocess/graph/tpr_data_process.sh b/tools/kal-test/bin/preprocess/graph/tpr_data_process.sh new file mode 100644 index 0000000..1086918 --- /dev/null +++ b/tools/kal-test/bin/preprocess/graph/tpr_data_process.sh @@ -0,0 +1,36 @@ +#!/bin/bash +set -e + +case "$1" in +-h | --help | ?) + echo "Usage: " + echo "1st argument: inputPath of dataset" + echo "2nd argument: outPath of dataset" + exit 0 + ;; +esac + +if [ $# -ne 2 ]; then + echo "please input 2 arguments: " + echo "1st argument: inputPath of dataset" + echo "2nd argument: outPath of dataset" + exit 0 +fi + +inputPath=$1 +outPath=$2 + +source conf/graph/graph_datasets.properties +scala_version=scalaVersion +scala_version_val=${!scala_version} + +spark-submit \ +--class com.bigdata.preprocess.graph.TrillionPageRankDataProcess \ +--master yarn \ +--num-executors 39 \ +--executor-memory 23g \ +--executor-cores 7 \ +--driver-memory 80g \ +--conf spark.serializer=org.apache.spark.serializer.KryoSerializer \ +--conf spark.kryoserializer.buffer=2040m \ +./lib/kal-test_${scala_version_val}-0.1.jar ${inputPath} ${outPath} | tee ./log/log \ No newline at end of file diff --git a/tools/kal-test/bin/preprocess/ml/encoder_data_gen.sh b/tools/kal-test/bin/preprocess/ml/encoder_data_gen.sh new file mode 100644 index 0000000..7bfaa92 --- /dev/null +++ b/tools/kal-test/bin/preprocess/ml/encoder_data_gen.sh @@ -0,0 +1,58 @@ +mapPath=./datasets/featureMap_400m.json +dataPath=hdfs:///tmp/ml/dataset/encoder/encoder_400m +num_executors=71 +executor_cores=4 +executor_memory=12 + +function createDir() { + dir=$1 + if [ ! 
+    mkdir $dir
+  fi
+}
+createDir datasets
+hadoop fs -mkdir -p ${dataPath}
+hadoop fs -rm -r ${dataPath}
+
+cpu_name=$(lscpu | grep Architecture | awk '{print $2}')
+
+source conf/ml/ml_datasets.properties
+spark_version=sparkVersion
+spark_version_val=${!spark_version}
+kal_version=kalVersion
+kal_version_val=${!kal_version}
+scala_version=scalaVersion
+scala_version_val=${!scala_version}
+
+echo "start to clean cache and sleep 3s"
+ssh server1 "echo 3 > /proc/sys/vm/drop_caches"
+ssh agent1 "echo 3 > /proc/sys/vm/drop_caches"
+ssh agent2 "echo 3 > /proc/sys/vm/drop_caches"
+ssh agent3 "echo 3 > /proc/sys/vm/drop_caches"
+sleep 3
+
+# --conf spark.executor.extraJavaOptions="-Xms${executor_memory}g" \
+
+spark-submit \
+--class com.bigdata.preprocess.ml.EncoderDataGenRun \
+--jars ./lib/boostkit-ml-kernel-${scala_version_val}-${kal_version_val}-${spark_version_val}-${cpu_name}.jar \
+--driver-java-options "-Dhdp.version=3.1.0.0-78" \
+--master yarn \
+--num-executors ${num_executors} \
+--executor-cores ${executor_cores} \
+--executor-memory ${executor_memory}g \
+--conf spark.rdd.compress=false \
+--conf spark.eventLog.enabled=true \
+--conf spark.driver.maxResultSize=40g \
+--conf spark.network.timeout=60s \
+--conf "spark.driver.extraJavaOptions=-Xss5g -Dlog4j.configuration=file:./log4j.properties" \
+./lib/kal-test_${scala_version_val}-0.1.jar \
+--mapPath ${mapPath} \
+--dataPath ${dataPath} \
+--numSamples 400000000
+
+# --conf spark.executor.memoryOverhead=2048 \
+#--driver-java-options "-Xms15g" \
+#--conf spark.driver.cores=36 \
+#--conf spark.driver.memory=50g \
+#--conf spark.sql.cbo.enabled=true \
diff --git a/tools/kal-test/conf/graph/betweenness/betweenness.yml b/tools/kal-test/conf/graph/betweenness/betweenness.yml
new file mode 100644
index 0000000..da8ca73
--- /dev/null
+++ b/tools/kal-test/conf/graph/betweenness/betweenness.yml
@@ -0,0 +1,17 @@
+# Betweenness dataset parameters arm
+
+betweenness:
+  cit_patents:
+    splitGraph: "\t"
+    p: 0.156
+    k: 10000
+
+  enwiki_2018:
+    splitGraph: "\t"
+    p: 0.005
+    k: 10000
+
+  uk_2002:
+    splitGraph: "\t"
+    p: 0.0014
+    k: 10000
\ No newline at end of file
diff --git a/tools/kal-test/conf/graph/betweenness/betweenness_spark.properties b/tools/kal-test/conf/graph/betweenness/betweenness_spark.properties
new file mode 100644
index 0000000..dba4729
--- /dev/null
+++ b/tools/kal-test/conf/graph/betweenness/betweenness_spark.properties
@@ -0,0 +1,53 @@
+# Spark parameters
+deployMode=client
+driverMemory=80g
+
+# opt
+cit_patents_numExecutors_aarch64=71
+cit_patents_executorCores_aarch64=4
+cit_patents_executorMemory_aarch64=12G
+cit_patents_numPartitions_aarch64=240
+cit_patents_BetweennessPartNum_aarch64=864
+cit_patents_ThreadNum_aarch64=2
+cit_patents_SparkTaskCpus_aarch64=2
+
+enwiki_2018_numExecutors_aarch64=71
+enwiki_2018_executorCores_aarch64=4
+enwiki_2018_executorMemory_aarch64=12G
+enwiki_2018_numPartitions_aarch64=576
+enwiki_2018_BetweennessPartNum_aarch64=3168
+enwiki_2018_ThreadNum_aarch64=2
+enwiki_2018_SparkTaskCpus_aarch64=2
+
+uk_2002_numExecutors_aarch64=71
+uk_2002_executorCores_aarch64=4
+uk_2002_executorMemory_aarch64=12G
+uk_2002_numPartitions_aarch64=240
+uk_2002_BetweennessPartNum_aarch64=864
+uk_2002_ThreadNum_aarch64=2
+uk_2002_SparkTaskCpus_aarch64=2
+
+# raw
+cit_patents_numExecutors_x86_64=29
+cit_patents_executorCores_x86_64=8
+cit_patents_executorMemory_x86_64=31G
+cit_patents_numPartitions_x86_64=240
+cit_patents_pivots=10300
+cit_patents_iteration=10000
+cit_patents_graphSplit="\t"
+
+enwiki_2018_numExecutors_x86_64=59 +enwiki_2018_executorCores_x86_64=4 +enwiki_2018_executorMemory_x86_64=15G +enwiki_2018_numPartitions_x86_64=240 +enwiki_2018_pivots=30 +enwiki_2018_iteration=10 +enwiki_2018_graphSplit="\t" + +uk_2002_numExecutors_x86_64=23 +uk_2002_executorCores_x86_64=10 +uk_2002_executorMemory_x86_64=38G +uk_2002_numPartitions_x86_64=240 +uk_2002_pivots=35 +uk_2002_iteration=13 +uk_2002_graphSplit="\t" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/bfs/bfs.yml b/tools/kal-test/conf/graph/bfs/bfs.yml new file mode 100644 index 0000000..ed3c812 --- /dev/null +++ b/tools/kal-test/conf/graph/bfs/bfs.yml @@ -0,0 +1,32 @@ +# BFS dataset parameters + +bfs: + cit_patents: + splitGraph: "\t" + isDirect: true + depthLimit: 10 + + enwiki_2018: + splitGraph: "\t" + isDirect: true + depthLimit: 10 + + arabic_2005: + splitGraph: "\t" + isDirect: true + depthLimit: 10 + + graph500_22: + splitGraph: " " + isDirect: false + depthLimit: 10 + + graph500_23: + splitGraph: " " + isDirect: false + depthLimit: 10 + + graph500_25: + splitGraph: " " + isDirect: false + depthLimit: 10 diff --git a/tools/kal-test/conf/graph/bfs/bfs_source_id.properties b/tools/kal-test/conf/graph/bfs/bfs_source_id.properties new file mode 100644 index 0000000..4e04763 --- /dev/null +++ b/tools/kal-test/conf/graph/bfs/bfs_source_id.properties @@ -0,0 +1,13 @@ +cit_patents_SourceID=5907828 +enwiki_2018_SourceID=746 +arabic_2005_SourceID=1670 +graph500_22_SourceID=801590 +graph500_23_SourceID=7343958 +graph500_25_SourceID=2984368 + +#cit_patents_SourceID=5907828,5415318,5187336,5833578,4392749,4841212,5076293,4040028,4372582,4795119 +#enwiki_2018_SourceID=746,8772,21772,120250,622934,1551636,3965549,1604602,1604609,1604631 +#arabic_2005_SourceID=1670,51860,152713,241498,4301061,10012301,22633966,22671309,22671985,11919106 +#graph500_22_SourceID=801590,2673237,254471,1610346,3625091,2079512,2999118,2079626,2788395,644668,2404500,2098295,2353233,3009625,1273090 +#graph500_23_SourceID=7343958,1320614,2134798,4771523,5113815,7826138,468980,3608955,7634969,4120652,7935287,4633288,2703467,4672090,710114,1036350,6538582,7410401,7713575,5723238 +#graph500_25_SourceID=2984368,28051905,23288850,8133358,30465165,32077192,3588225,10567776,15564773,22821566,16473042,1057188,3618262,10779632,31184653,10803749,30374117,3015466,18845296,20096970,15527064,25432179,12294489,25461250,16997339,798169 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/bfs/bfs_spark.properties b/tools/kal-test/conf/graph/bfs/bfs_spark.properties new file mode 100644 index 0000000..511ce9a --- /dev/null +++ b/tools/kal-test/conf/graph/bfs/bfs_spark.properties @@ -0,0 +1,78 @@ +# Spark parameters +deployMode=client + +# opt +cit_patents_numExecutors_aarch64=35 +cit_patents_executorCores_aarch64=8 +cit_patents_executorMemory_aarch64=25G +cit_patents_numPartitions_aarch64=280 + +enwiki_2018_numExecutors_aarch64=35 +enwiki_2018_executorCores_aarch64=8 +enwiki_2018_executorMemory_aarch64=25G +enwiki_2018_numPartitions_aarch64=280 + +arabic_2005_numExecutors_aarch64=35 +arabic_2005_executorCores_aarch64=8 +arabic_2005_executorMemory_aarch64=25G +arabic_2005_numPartitions_aarch64=280 + +graph500_22_numExecutors_aarch64=35 +graph500_22_executorCores_aarch64=8 +graph500_22_executorMemory_aarch64=25G +graph500_22_numPartitions_aarch64=280 + +graph500_23_numExecutors_aarch64=35 +graph500_23_executorCores_aarch64=8 +graph500_23_executorMemory_aarch64=25G +graph500_23_numPartitions_aarch64=280 + 
+graph500_25_numExecutors_aarch64=35 +graph500_25_executorCores_aarch64=8 +graph500_25_executorMemory_aarch64=25G +graph500_25_numPartitions_aarch64=560 + +# raw +cit_patents_numExecutors_x86_64=11 +cit_patents_executorCores_x86_64=20 +cit_patents_executorMemory_x86_64=78G +cit_patents_numPartitions_x86_64=220 + +enwiki_2018_numExecutors_x86_64=11 +enwiki_2018_executorCores_x86_64=20 +enwiki_2018_executorMemory_x86_64=78G +enwiki_2018_numPartitions_x86_64=220 + +arabic_2005_numExecutors_x86_64=29 +arabic_2005_executorCores_x86_64=8 +arabic_2005_executorMemory_x86_64=31G +arabic_2005_numPartitions_x86_64=232 + +graph500_22_numExecutors_x86_64=12 +graph500_22_executorCores_x86_64=19 +graph500_22_executorMemory_x86_64=78G +graph500_22_numPartitions_x86_64=228 + +graph500_23_numExecutors_x86_64=11 +graph500_23_executorCores_x86_64=20 +graph500_23_executorMemory_x86_64=78G +graph500_23_numPartitions_x86_64=508 + +graph500_25_numExecutors_x86_64=39 +graph500_25_executorCores_x86_64=6 +graph500_25_executorMemory_x86_64=23G +graph500_25_numPartitions_x86_64=468 + +# raw +cit_patents_split="\t" +cit_patents_q=true +enwiki_2018_split="\t" +enwiki_2018_q=true +arabic_2005_split="\t" +arabic_2005_q=true +graph500_22_split=" " +graph500_22_q=false +graph500_23_split="," +graph500_23_q=false +graph500_25_split=" " +graph500_25_q=false \ No newline at end of file diff --git a/tools/kal-test/conf/graph/cc/cc.yml b/tools/kal-test/conf/graph/cc/cc.yml new file mode 100644 index 0000000..bcb0c9e --- /dev/null +++ b/tools/kal-test/conf/graph/cc/cc.yml @@ -0,0 +1,14 @@ +partition: + liveJournal_x86_64_yes: 210 + liveJournal_x86_64_no: 78 + liveJournal_aarch64_no: 78 + graph500_25_x86_64_yes: 210 + graph500_25_x86_64_no: 210 + graph500_25_aarch64_no: 210 + graph500_26_x86_64_yes: 210 + graph500_26_x86_64_no: 210 + graph500_26_aarch64_no: 210 +split: + liveJournal: "\t" + graph500_25: " " + graph500_26: " " \ No newline at end of file diff --git a/tools/kal-test/conf/graph/cc/cc_spark.properties b/tools/kal-test/conf/graph/cc/cc_spark.properties new file mode 100644 index 0000000..f8e6002 --- /dev/null +++ b/tools/kal-test/conf/graph/cc/cc_spark.properties @@ -0,0 +1,19 @@ +master=yarn +deployMode=client +driverMemory=200g + +numExecutors_aarch64=71 +executorMemory_aarch64=12g +executorCores_aarch64=4 +executorExtraJavaopts_aarch64="-Xms12g" +defaultParallelism_liveJournal_aarch64=78 +defaultParallelism_graph500_25_aarch64=230 +defaultParallelism_graph500_26_aarch64=280 + +numExecutors_x86_64=12 +executorMemory_x86_64=78g +executorCores_x86_64=18 +executorExtraJavaopts_x86_64="-Xms78g" +defaultParallelism_liveJournal_x86_64=470 +defaultParallelism_graph500_25_x86_64=210 +defaultParallelism_graph500_26_x86_64=460 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/cd/cd.yml b/tools/kal-test/conf/graph/cd/cd.yml new file mode 100644 index 0000000..594d083 --- /dev/null +++ b/tools/kal-test/conf/graph/cd/cd.yml @@ -0,0 +1,18 @@ +partition: + simulate1_x86_64_yes: 228 + simulate1_x86_64_no: 236 + simulate1_aarch64_no: 284 + simulate2_x86_64_yes: 232 + simulate2_x86_64_no: 236 + simulate2_aarch64_no: 284 + usaRoad_x86_64_yes: 228 + usaRoad_x86_64_no: 228 + usaRoad_aarch64_no: 282 +split: + simulate1: "," + simulate2: "," + usaRoad: " " +minLoopLen: 3 +maxLoopLen: 10 +minRate: 0.8 +maxRate: 1.2 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/cd/cd_spark.properties b/tools/kal-test/conf/graph/cd/cd_spark.properties new file mode 100644 index 0000000..7346280 --- /dev/null +++ 
b/tools/kal-test/conf/graph/cd/cd_spark.properties @@ -0,0 +1,33 @@ +master=yarn +deployMode=client +driverMemory=80g + +numExecutors_simulate1_aarch64=71 +executorMemory_simulate1_aarch64=12g +executorCores_simulate1_aarch64=4 +executorExtraJavaopts_simulate1_aarch64="-Xms12g" + +numExecutors_simulate2_aarch64=71 +executorMemory_simulate2_aarch64=12g +executorCores_simulate2_aarch64=4 +executorExtraJavaopts_simulate2_aarch64="-Xms12g" + +numExecutors_usaRoad_aarch64=47 +executorMemory_usaRoad_aarch64=19g +executorCores_usaRoad_aarch64=6 +executorExtraJavaopts_usaRoad_aarch64="-Xms19g" + +numExecutors_simulate1_x86_64=59 +executorMemory_simulate1_x86_64=15g +executorCores_simulate1_x86_64=4 +executorExtraJavaopts_simulate1_x86_64="-Xms15g" + +numExecutors_simulate2_x86_64=59 +executorMemory_simulate2_x86_64=15g +executorCores_simulate2_x86_64=4 +executorExtraJavaopts_simulate2_x86_64="-Xms15g" + +numExecutors_usaRoad_x86_64=12 +executorMemory_usaRoad_x86_64=78g +executorCores_usaRoad_x86_64=19 +executorExtraJavaopts_usaRoad_x86_64="-Xms78g" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/closeness/closeness_spark.properties b/tools/kal-test/conf/graph/closeness/closeness_spark.properties new file mode 100644 index 0000000..50ddd0f --- /dev/null +++ b/tools/kal-test/conf/graph/closeness/closeness_spark.properties @@ -0,0 +1,79 @@ +# Spark parameters +deployMode=client +outputNodeNum=10000 +driverMemory=80g +split="\\s+" + +# opt +cit_patents_weighted_numExecutors_aarch64=71 +cit_patents_weighted_executorMemory_aarch64=12G +cit_patents_weighted_executorCores_aarch64=4 +cit_patents_weighted_numPartitions_aarch64=288 +cit_patents_weighted_ratio_aarch64=0.00001 + +cit_patents_unweighted_numExecutors_aarch64=71 +cit_patents_unweighted_executorMemory_aarch64=12G +cit_patents_unweighted_executorCores_aarch64=4 +cit_patents_unweighted_numPartitions_aarch64=288 +cit_patents_unweighted_ratio_aarch64=0.00001 + +graph500_23_weighted_numExecutors_aarch64=71 +graph500_23_weighted_executorMemory_aarch64=12G +graph500_23_weighted_executorCores_aarch64=4 +graph500_23_weighted_numPartitions_aarch64=288 +graph500_23_weighted_ratio_aarch64=0.0001 + +graph500_23_unweighted_numExecutors_aarch64=71 +graph500_23_unweighted_executorMemory_aarch64=12G +graph500_23_unweighted_executorCores_aarch64=4 +graph500_23_unweighted_numPartitions_aarch64=288 +graph500_23_unweighted_ratio_aarch64=0.0001 + +uk_2002_weighted_numExecutors_aarch64=35 +uk_2002_weighted_executorMemory_aarch64=25G +uk_2002_weighted_executorCores_aarch64=8 +uk_2002_weighted_numPartitions_aarch64=288 +uk_2002_weighted_ratio_aarch64=0.001 + +uk_2002_unweighted_numExecutors_aarch64=71 +uk_2002_unweighted_executorMemory_aarch64=12G +uk_2002_unweighted_executorCores_aarch64=4 +uk_2002_unweighted_numPartitions_aarch64=288 +uk_2002_unweighted_ratio_aarch64=0.001 + +# raw +cit_patents_weighted_numExecutors_x86_64=18 +cit_patents_weighted_executorMemory_x86_64=52G +cit_patents_weighted_executorCores_x86_64=12 +cit_patents_weighted_numPartitions_x86_64=240 +cit_patents_weighted_ratio_x86_64=0.00001 + +cit_patents_unweighted_numExecutors_x86_64=18 +cit_patents_unweighted_executorMemory_x86_64=52G +cit_patents_unweighted_executorCores_x86_64=12 +cit_patents_unweighted_numPartitions_x86_64=240 +cit_patents_unweighted_ratio_x86_64=0.00001 + +graph500_23_weighted_numExecutors_x86_64=59 +graph500_23_weighted_executorMemory_x86_64=15G +graph500_23_weighted_executorCores_x86_64=4 +graph500_23_weighted_numPartitions_x86_64=480 
+graph500_23_weighted_ratio_x86_64=0.0001 + +graph500_23_unweighted_numExecutors_x86_64=29 +graph500_23_unweighted_executorMemory_x86_64=31G +graph500_23_unweighted_executorCores_x86_64=8 +graph500_23_unweighted_numPartitions_x86_64=240 +graph500_23_unweighted_ratio_x86_64=0.0001 + +uk_2002_weighted_numExecutors_x86_64=59 +uk_2002_weighted_executorMemory_x86_64=15G +uk_2002_weighted_executorCores_x86_64=4 +uk_2002_weighted_numPartitions_x86_64=240 +uk_2002_weighted_ratio_x86_64=0.0001 + +uk_2002_unweighted_numExecutors_x86_64=29 +uk_2002_unweighted_executorMemory_x86_64=31G +uk_2002_unweighted_executorCores_x86_64=8 +uk_2002_unweighted_numPartitions_x86_64=240 +uk_2002_unweighted_ratio_x86_64=0.0001 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient.yml b/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient.yml new file mode 100644 index 0000000..09b76d4 --- /dev/null +++ b/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient.yml @@ -0,0 +1,38 @@ +clusteringCoefficient: + opt: + cit_patents: + splitGraph: "\t" + isDirect: true + + uk_2002: + splitGraph: "\t" + isDirect: true + + arabic_2005: + splitGraph: "\t" + isDirect: true + + graph500_22: + splitGraph: " " + isDirect: false + + graph500_23: + splitGraph: " " + isDirect: false + + graph500_24: + splitGraph: " " + isDirect: false + + graph500_25: + splitGraph: " " + isDirect: false + + raw: + graph500_22: + splitGraph: " " + isDirect: false + + graph500_23: + splitGraph: " " + isDirect: false \ No newline at end of file diff --git a/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient_spark.properties b/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient_spark.properties new file mode 100644 index 0000000..2bcf72f --- /dev/null +++ b/tools/kal-test/conf/graph/clusteringcoefficient/clusteringcoefficient_spark.properties @@ -0,0 +1,60 @@ +deployMode=client +driverMemory=50g +master=yarn + +# opt +cit_patents_numExecutors_aarch64=71 +cit_patents_executorCores_aarch64=4 +cit_patents_executorMemory_aarch64=12G +cit_patents_numPartitions_aarch64=720 + +uk_2002_numExecutors_aarch64=71 +uk_2002_executorCores_aarch64=4 +uk_2002_executorMemory_aarch64=12G +uk_2002_numPartitions_aarch64=720 + +arabic_2005_numExecutors_aarch64=71 +arabic_2005_executorCores_aarch64=4 +arabic_2005_executorMemory_aarch64=12G +arabic_2005_numPartitions_aarch64=720 + +graph500_22_numExecutors_aarch64=71 +graph500_22_executorCores_aarch64=4 +graph500_22_executorMemory_aarch64=12G +graph500_22_numPartitions_aarch64=720 + +graph500_23_numExecutors_aarch64=71 +graph500_23_executorCores_aarch64=4 +graph500_23_executorMemory_aarch64=12G +graph500_23_numPartitions_aarch64=720 + +graph500_24_numExecutors_aarch64=71 +graph500_24_executorCores_aarch64=4 +graph500_24_executorMemory_aarch64=12G +graph500_24_numPartitions_aarch64=720 + +graph500_25_numExecutors_aarch64=71 +graph500_25_executorCores_aarch64=4 +graph500_25_executorMemory_aarch64=12G +graph500_25_numPartitions_aarch64=720 + +# raw +graph500_22_numExecutors_x86_64=29 +graph500_22_executorCores_x86_64=8 +graph500_22_executorMemory_x86_64=35G +graph500_22_numPartitions_x86_64=232 + +graph500_23_numExecutors_x86_64=29 +graph500_23_executorCores_x86_64=8 +graph500_23_executorMemory_x86_64=35G +graph500_23_numPartitions_x86_64=300 + +graph500_24_numExecutors_x86_64=29 +graph500_24_executorCores_x86_64=8 +graph500_24_executorMemory_x86_64=35G +graph500_24_numPartitions_x86_64=300 + 
+graph500_25_numExecutors_x86_64=29 +graph500_25_executorCores_x86_64=8 +graph500_25_executorMemory_x86_64=35G +graph500_25_numPartitions_x86_64=300 diff --git a/tools/kal-test/conf/graph/deepwalk/deepwalk.yml b/tools/kal-test/conf/graph/deepwalk/deepwalk.yml new file mode 100644 index 0000000..147c38d --- /dev/null +++ b/tools/kal-test/conf/graph/deepwalk/deepwalk.yml @@ -0,0 +1,56 @@ +deepwalk: + cit_patents: + opt: + splitGraph: "," + partitions: 240 + walkLength: 10 + numWalks: 2 + iteration: 1 + dimension: 128 + windowSize: 10 + negativeSample: 5 + + raw: + splitGraph: "," + numIter: 100 + partitions: 236 + resetProb: 0.15 + isOnlySrc: false + + soc_LiveJournal: + opt: + splitGraph: "\t" + partitions: 240 + walkLength: 8 + numWalks: 3 + iteration: 3 + dimension: 128 + windowSize: 5 + negativeSample: 3 + + raw: + splitGraph: "," + numIter: 100 + partitions: 236 + resetProb: 0.15 + isOnlySrc: false + + uk_2002: + opt: + splitGraph: "," + partitions: 240 + walkLength: 8 + numWalks: 2 + iteration: 1 + dimension: 128 + windowSize: 5 + negativeSample: 1 + + raw: + splitGraph: "," + numIter: 100 + partitions: 236 + resetProb: 0.15 + isOnlySrc: false + + diff --git a/tools/kal-test/conf/graph/deepwalk/deepwalk_spark.properties b/tools/kal-test/conf/graph/deepwalk/deepwalk_spark.properties new file mode 100644 index 0000000..0ecc2d0 --- /dev/null +++ b/tools/kal-test/conf/graph/deepwalk/deepwalk_spark.properties @@ -0,0 +1,56 @@ +deployMode=client +#opt +numExectuors_cit_patents_aarch64=3 +executorCores_cit_patents_aarch64=94 +executorMemory_cit_patents_aarch64=315G +extraJavaOptions_cit_patents_aarch64=-Xms315G + +numExectuors_cit_patents_x86_64=3 +executorCores_cit_patents_x86_64=78 +executorMemory_cit_patents_x86_64=315G +extraJavaOptions_cit_patents_x86_64=-Xms315G + +numExectuors_soc_LiveJournal_aarch64=3 +executorCores_soc_LiveJournal_aarch64=94 +executorMemory_soc_LiveJournal_aarch64=315G +extraJavaOptions_soc_LiveJournal_aarch64=-Xms315G + +numExectuors_soc_LiveJournal_x86_64=3 +executorCores_soc_LiveJournal_x86_64=78 +executorMemory_soc_LiveJournal_x86_64=315G +extraJavaOptions_soc_LiveJournal_x86_64=-Xms315G + +numExectuors_uk_2002_aarch64=3 +executorCores_uk_2002_aarch64=94 +executorMemory_uk_2002_aarch64=315G +extraJavaOptions_uk_2002_aarch64=-Xms315G + +numExectuors_uk_2002_x86_64=3 +executorCores_uk_2002_x86_64=78 +executorMemory_uk_2002_x86_64=315G +extraJavaOptions_uk_2002_x86_64=-Xms315G + +#raw +walkLength_cit_patents_x86_64=10 +numWalks_cit_patents_x86_64=5 +dimension_cit_patents_x86_64=128 +partitions_cit_patents_x86_64=360 +iteration_cit_patents_x86_64=1 +windowSize_cit_patents_x86_64=5 +splitGraph_cit_patents_x86_64="," + +walkLength_soc_LiveJournal_x86_64=10 +numWalks_soc_LiveJournal_x86_64=5 +dimension_soc_LiveJournal_x86_64=128 +partitions_soc_LiveJournal_x86_64=240 +iteration_soc_LiveJournal_x86_64=1 +windowSize_soc_LiveJournal_x86_64=10 +splitGraph_soc_LiveJournal_x86_64="\t" + +walkLength_uk_2002_x86_64=10 +numWalks_uk_2002_x86_64=1 +dimension_uk_2002_x86_64=128 +partitions_uk_2002_x86_64=240 +iteration_uk_2002_x86_64=1 +windowSize_uk_2002_x86_64=10 +splitGraph_uk_2002_x86_64="," diff --git a/tools/kal-test/conf/graph/degree/degree.yml b/tools/kal-test/conf/graph/degree/degree.yml new file mode 100644 index 0000000..37b2075 --- /dev/null +++ b/tools/kal-test/conf/graph/degree/degree.yml @@ -0,0 +1,19 @@ +# Degrees dataset parameters +degree: + it_2004: + splitGraph: " " + + twitter7: + splitGraph: " " + + uk_2007_05: + splitGraph: "\t" + + mycielskian20: + 
splitGraph: " " + + gap_kron: + splitGraph: " " + + com_friendster: + splitGraph: " " \ No newline at end of file diff --git a/tools/kal-test/conf/graph/degree/degree_spark.properties b/tools/kal-test/conf/graph/degree/degree_spark.properties new file mode 100644 index 0000000..7c3c195 --- /dev/null +++ b/tools/kal-test/conf/graph/degree/degree_spark.properties @@ -0,0 +1,99 @@ +# Spark parameters +deployMode=client + +inDegrees_it_2004_numExecutors_aarch64=35 +inDegrees_it_2004_executorCores_aarch64=8 +inDegrees_it_2004_executorMemory_aarch64=25G +inDegrees_it_2004_numPartitions_aarch64=150 + +inDegrees_twitter7_numExecutors_aarch64=35 +inDegrees_twitter7_executorCores_aarch64=8 +inDegrees_twitter7_executorMemory_aarch64=25G +inDegrees_twitter7_numPartitions_aarch64=150 + +inDegrees_uk_2007_05_numExecutors_aarch64=35 +inDegrees_uk_2007_05_executorCores_aarch64=8 +inDegrees_uk_2007_05_executorMemory_aarch64=25G +inDegrees_uk_2007_05_numPartitions_aarch64=300 + +outDegrees_it_2004_numExecutors_aarch64=35 +outDegrees_it_2004_executorCores_aarch64=8 +outDegrees_it_2004_executorMemory_aarch64=25G +outDegrees_it_2004_numPartitions_aarch64=150 + +outDegrees_twitter7_numExecutors_aarch64=59 +outDegrees_twitter7_executorCores_aarch64=4 +outDegrees_twitter7_executorMemory_aarch64=15G +outDegrees_twitter7_numPartitions_aarch64=240 + +outDegrees_uk_2007_05_numExecutors_aarch64=35 +outDegrees_uk_2007_05_executorCores_aarch64=8 +outDegrees_uk_2007_05_executorMemory_aarch64=25G +outDegrees_uk_2007_05_numPartitions_aarch64=150 + +degrees_mycielskian20_numExecutors_aarch64=47 +degrees_mycielskian20_executorCores_aarch64=6 +degrees_mycielskian20_executorMemory_aarch64=19G +degrees_mycielskian20_numPartitions_aarch64=300 + +degrees_gap_kron_numExecutors_aarch64=35 +degrees_gap_kron_executorCores_aarch64=8 +degrees_gap_kron_executorMemory_aarch64=25G +degrees_gap_kron_numPartitions_aarch64=240 + +degrees_com_friendster_numExecutors_aarch64=35 +degrees_com_friendster_executorCores_aarch64=8 +degrees_com_friendster_executorMemory_aarch64=25G +degrees_com_friendster_numPartitions_aarch64=150 + +inDegrees_it_2004_numExecutors_x86_64=59 +inDegrees_it_2004_executorCores_x86_64=4 +inDegrees_it_2004_executorMemory_x86_64=15G +inDegrees_it_2004_numPartitions_x86_64=300 + +inDegrees_twitter7_numExecutors_x86_64=59 +inDegrees_twitter7_executorCores_x86_64=4 +inDegrees_twitter7_executorMemory_x86_64=15G +inDegrees_twitter7_numPartitions_x86_64=240 + +inDegrees_uk_2007_05_numExecutors_x86_64=39 +inDegrees_uk_2007_05_executorCores_x86_64=6 +inDegrees_uk_2007_05_executorMemory_x86_64=23G +inDegrees_uk_2007_05_numPartitions_x86_64=240 + +outDegrees_it_2004_numExecutors_x86_64=29 +outDegrees_it_2004_executorCores_x86_64=8 +outDegrees_it_2004_executorMemory_x86_64=31G +outDegrees_it_2004_numPartitions_x86_64=240 + +outDegrees_twitter7_numExecutors_x86_64=39 +outDegrees_twitter7_executorCores_x86_64=6 +outDegrees_twitter7_executorMemory_x86_64=23G +outDegrees_twitter7_numPartitions_x86_64=240 + +outDegrees_uk_2007_05_numExecutors_x86_64=59 +outDegrees_uk_2007_05_executorCores_x86_64=4 +outDegrees_uk_2007_05_executorMemory_x86_64=15G +outDegrees_uk_2007_05_numPartitions_x86_64=240 + +degrees_mycielskian20_numExecutors_x86_64=39 +degrees_mycielskian20_executorCores_x86_64=6 +degrees_mycielskian20_executorMemory_x86_64=23G +degrees_mycielskian20_numPartitions_x86_64=240 + +degrees_gap_kron_numExecutors_x86_64=59 +degrees_gap_kron_executorCores_x86_64=4 +degrees_gap_kron_executorMemory_x86_64=15G 
+degrees_gap_kron_numPartitions_x86_64=240 + +degrees_com_friendster_numExecutors_x86_64=59 +degrees_com_friendster_executorCores_x86_64=4 +degrees_com_friendster_executorMemory_x86_64=15G +degrees_com_friendster_numPartitions_x86_64=150 + +it_2004_splitGraph=" " +twitter7_splitGraph=" " +uk_2007_05_splitGraph="\t" +mycielskian20_splitGraph=" " +gap_kron_splitGraph=" " +com_friendster_splitGraph=" " \ No newline at end of file diff --git a/tools/kal-test/conf/graph/fraudar/fraudar.yml b/tools/kal-test/conf/graph/fraudar/fraudar.yml new file mode 100644 index 0000000..f851a1c --- /dev/null +++ b/tools/kal-test/conf/graph/fraudar/fraudar.yml @@ -0,0 +1,26 @@ +wlpa: + opt: + alpha: + splitGraph: "\\s+" + partitions: 240 + + amazon: + splitGraph: "\\s+" + partitions: 240 + + otc: + splitGraph: "\\s+" + partitions: 240 + + raw: + alpha: + splitGraph: "\\s+" + partitions: 240 + + amazon: + splitGraph: "\\s+" + partitions: 240 + + otc: + splitGraph: "\\s+" + partitions: 240 diff --git a/tools/kal-test/conf/graph/fraudar/fraudar_spark.properties b/tools/kal-test/conf/graph/fraudar/fraudar_spark.properties new file mode 100644 index 0000000..d7873cc --- /dev/null +++ b/tools/kal-test/conf/graph/fraudar/fraudar_spark.properties @@ -0,0 +1,33 @@ +master=yarn +deployMode=client +driverMemory=300g + +numExecutors_alpha_aarch64=12 +executorMemory_alpha_aarch64=78g +executorCores_alpha_aarch64=18 +executorExtraJavaopts_alpha_aarch64="-Xms78g" + +numExecutors_amazon_aarch64=12 +executorMemory_amazon_aarch64=78g +executorCores_amazon_aarch64=18 +executorExtraJavaopts_amazon_aarch64="-Xms78g" + +numExecutors_otc_aarch64=12 +executorMemory_otc_aarch64=78g +executorCores_otc_aarch64=18 +executorExtraJavaopts_otc_aarch64="-Xms78g" + +numExecutors_alpha_x86_64=12 +executorMemory_alpha_x86_64=78g +executorCores_alpha_x86_64=18 +executorExtraJavaopts_alpha_x86_64="-Xms78g" + +numExecutors_amazon_x86_64=12 +executorMemory_amazon_x86_64=78g +executorCores_amazon_x86_64=18 +executorExtraJavaopts_amazon_x86_64="-Xms78g" + +numExecutors_otc_x86_64=12 +executorMemory_otc_x86_64=78g +executorCores_otc_x86_64=18 +executorExtraJavaopts_otc_x86_64="-Xms78g" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/graph_datasets.properties b/tools/kal-test/conf/graph/graph_datasets.properties new file mode 100644 index 0000000..ebbbc81 --- /dev/null +++ b/tools/kal-test/conf/graph/graph_datasets.properties @@ -0,0 +1,83 @@ +# sparkVersion +sparkVersion=spark3.1.1 + +# kalVersion +kalVersion=2.2.0 + +# scalaVersion +scalaVersion=2.12 + +# undirected graph dataset path +graph500_19=hdfs:///tmp/graph/dataset/graph500-19.e +graph500_22=hdfs:///tmp/graph/dataset/graph500-22.e +graph500_23=hdfs:///tmp/graph/dataset/graph500-23.e +graph500_24=hdfs:///tmp/graph/dataset/graph500-24.e +graph500_25=hdfs:///tmp/graph/dataset/graph500-25.e +graph500_26=hdfs:///tmp/graph/dataset/graph500-26.e +liveJournal=hdfs:///tmp/graph/dataset/com-lj.ungraph.txt +mycielskian20=hdfs:///tmp/graph/dataset/mycielskian20.mtx +gap_kron=hdfs:///tmp/graph/dataset/GAP-kron.mtx +com_friendster=hdfs:///tmp/graph/dataset/com-Friendster.mtx +com_orkut=hdfs:///tmp/graph/dataset/com-orkut.ungraph.txt +cage14=hdfs:///tmp/graph/dataset/wpr/cage14.mtx +GAP_road=hdfs:///tmp/graph/dataset/wpr/GAP-road.mtx +GAP_twitter=hdfs:///tmp/graph/dataset/wpr/GAP-twitter.mtx +twitter_2010=hdfs:///tmp/graph/dataset/incData/twitter-2010 +twitter_tpr=hdfs:///tmp/graph/dataset/trillionPageRank +alpha=hdfs:///tmp/graph/dataset/alpha_renumber.txt 
+amazon=hdfs:///tmp/graph/dataset/amazon_renumber.txt +otc=hdfs:///tmp/graph/dataset/otc_renumber.txt + +# directed graph dataset path +cit_patents=hdfs:///tmp/graph/dataset/cit-Patents.txt +uk_2002=hdfs:///tmp/graph/dataset/uk-2002-edgelist.txt +arabic_2005=hdfs:///tmp/graph/dataset/arabic-2005-edgelist.txt +enwiki_2018=hdfs:///tmp/graph/dataset/enwiki-2018-edgelist.txt +simulate1=hdfs:///tmp/graph/dataset/Simulate1.csv +simulate2=hdfs:///tmp/graph/dataset/Simulate2.csv +usaRoad=hdfs:///tmp/graph/dataset/USA-road-d.USA.gr +it_2004=hdfs:///tmp/graph/dataset/it-2004.mtx +twitter7=hdfs:///tmp/graph/dataset/twitter7.mtx +uk_2007_05=hdfs:///tmp/graph/dataset/uk-2007-05-edgelist.txt +soc_liveJournal=hdfs:///tmp/graph/dataset/soc-LiveJournal1.txt +twitter=hdfs:///tmp/graph/dataset/twitter-2010-edgelist.txt + +# directed graph dataset source path +uk_2002_5=hdfs:///tmp/graph/mssp/source/uk-random_5 +uk_2002_50=hdfs:///tmp/graph/mssp/source/uk-random_50 +arabic_2005_5=hdfs:///tmp/graph/mssp/source/arabic-random_5 +arabic_2005_50=hdfs:///tmp/graph/mssp/source/arabic-random_50 +soc_liveJournal_5=hdfs:///tmp/graph/mssp/source/livejournal-random_5 +soc_liveJournal_50=hdfs:///tmp/graph/mssp/source/livejournal-random_50 + +# groundTruth data path +cit_patents_gt=hdfs:///tmp/graph/dataset/gt/cit_gt.txt +enwiki_2018_gt=hdfs:///tmp/graph/dataset/gt/enwiki_gt.txt +uk_2002_gt=hdfs:///tmp/graph/dataset/gt/uk_gt.txt + +# closeness gt path +closeness_gt_cit_patents=hdfs:///tmp/graph/dataset/closeness/cit-gt.txt +closeness_gt_uk_2002=hdfs:///tmp/graph/dataset/closeness/uk-gt.txt + +# negEdge data path +cit_patents_negEdge=hdfs:///tmp/graph/dataset/cit-Patents-negEdges.txt +soc_liveJournal_negEdge=hdfs:///tmp/graph/dataset/soc-LiveJournal1-negEdges.txt +uk_2002_negEdge=hdfs:///tmp/graph/dataset/uk-2002-edgelist-negEdges.txt + +# sgm query graph path +query_4clique=hdfs:///tmp/graph/dataset/sgmQueryGraph/4clique.txt +query_4sqr=hdfs:///tmp/graph/dataset/sgmQueryGraph/4sqr.txt +query_4dgn=hdfs:///tmp/graph/dataset/sgmQueryGraph/4sqr-dgn.txt +query_5clique=hdfs:///tmp/graph/dataset/sgmQueryGraph/5clique.txt +query_5tree=hdfs:///tmp/graph/dataset/sgmQueryGraph/5tree.txt +query_6clique=hdfs:///tmp/graph/dataset/sgmQueryGraph/6clique.txt +query_6star=hdfs:///tmp/graph/dataset/sgmQueryGraph/6star.txt + +# output data path +uk_2002_community=hdfs:///tmp/graph/result/louvain/no/uk_2002 +arabic_2005_community=hdfs:///tmp/graph/result/louvain/no/arabic_2005 +twitter_community=hdfs:///tmp/graph/result/louvain/no/twitter +graph500_23_community=hdfs:///tmp/graph/result/louvain/no/graph500_23 +graph500_25_community=hdfs:///tmp/graph/result/louvain/no/graph500_25 +graph500_26_community=hdfs:///tmp/graph/result/louvain/no/graph500_26 +output_path_prefix=/tmp/graph/result \ No newline at end of file diff --git a/tools/kal-test/conf/graph/inccc/inccc.yml b/tools/kal-test/conf/graph/inccc/inccc.yml new file mode 100644 index 0000000..1a3aa5e --- /dev/null +++ b/tools/kal-test/conf/graph/inccc/inccc.yml @@ -0,0 +1,13 @@ +inccc: + opt: + graph500_26: + splitGraph: "," + partitions: 240 + + com_Friendster: + splitGraph: "," + partitions: 240 + + webbase_2001: + splitGraph: "," + partitions: 240 diff --git a/tools/kal-test/conf/graph/inccc/inccc_spark.properties b/tools/kal-test/conf/graph/inccc/inccc_spark.properties new file mode 100644 index 0000000..3dad072 --- /dev/null +++ b/tools/kal-test/conf/graph/inccc/inccc_spark.properties @@ -0,0 +1,30 @@ +deployMode=client +numExectuors_graph500_26_aarch64=71 
+executorCores_graph500_26_aarch64=4 +executorMemory_graph500_26_aarch64=12G +extraJavaOptions_graph500_26_aarch64=-Xms12G + +numExectuors_graph500_26_x86_64=71 +executorCores_graph500_26_x86_64=4 +executorMemory_graph500_26_x86_64=12G +extraJavaOptions_graph500_26_x86_64=-Xms12G + +numExectuors_com_Friendster_aarch64=71 +executorCores_com_Friendster_aarch64=4 +executorMemory_com_Friendster_aarch64=12G +extraJavaOptions_com_Friendster_aarch64=-Xms12G + +numExectuors_com_Friendster_x86_64=71 +executorCores_com_Friendster_x86_64=4 +executorMemory_com_Friendster_x86_64=12G +extraJavaOptions_com_Friendster_x86_64=-Xms12G + +numExectuors_webbase_2001_aarch64=71 +executorCores_webbase_2001_aarch64=4 +executorMemory_webbase_2001_aarch64=12G +extraJavaOptions_webbase_2001_aarch64=-Xms12G + +numExectuors_webbase_2001_x86_64=71 +executorCores_webbase_2001_x86_64=4 +executorMemory_webbase_2001_x86_64=12G +extraJavaOptions_webbase_2001_x86_64=-Xms12G \ No newline at end of file diff --git a/tools/kal-test/conf/graph/incpr/incpr.yml b/tools/kal-test/conf/graph/incpr/incpr.yml new file mode 100644 index 0000000..f9dd055 --- /dev/null +++ b/tools/kal-test/conf/graph/incpr/incpr.yml @@ -0,0 +1,8 @@ +# incPageRank dataset parameters arm + +incpr: + twitter_2010: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + partNum: 273 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/incpr/incpr_spark.properties b/tools/kal-test/conf/graph/incpr/incpr_spark.properties new file mode 100644 index 0000000..9b95003 --- /dev/null +++ b/tools/kal-test/conf/graph/incpr/incpr_spark.properties @@ -0,0 +1,10 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores=50 +driverMemory=80G +numExectuors=39 +executorCores=7 +executorMemory=23G +execMemOverhead=2G +extraJavaOptions=-Xms23g diff --git a/tools/kal-test/conf/graph/katz/katz.yml b/tools/kal-test/conf/graph/katz/katz.yml new file mode 100644 index 0000000..1826ee3 --- /dev/null +++ b/tools/kal-test/conf/graph/katz/katz.yml @@ -0,0 +1,26 @@ +katz: + opt: + cit_patents: + splitGraph: "," + partitions: 40 + isWeight: false + tol: 1e-7 + maxIter: 100000 + normalized: true + + uk_2002: + splitGraph: "\t" + partitions: 245 + isWeight: false + tol: 1e-7 + maxIter: 100000 + normalized: true + + arabic_2005: + splitGraph: "\t" + partitions: 245 + isWeight: false + tol: 1e-7 + maxIter: 100000 + normalized: true + diff --git a/tools/kal-test/conf/graph/katz/katz_spark.properties b/tools/kal-test/conf/graph/katz/katz_spark.properties new file mode 100644 index 0000000..e5aa2d1 --- /dev/null +++ b/tools/kal-test/conf/graph/katz/katz_spark.properties @@ -0,0 +1,30 @@ +deployMode=client +numExectuors_cit_patents_aarch64=23 +executorCores_cit_patents_aarch64=12 +executorMemory_cit_patents_aarch64=38G +extraJavaOptions_cit_patents_aarch64=-Xms38G + +numExectuors_cit_patents_x86_64=23 +executorCores_cit_patents_x86_64=12 +executorMemory_cit_patents_x86_64=38G +extraJavaOptions_cit_patents_x86_64=-Xms38G + +numExectuors_arabic_2005_aarch64=35 +executorCores_arabic_2005_aarch64=8 +executorMemory_arabic_2005_aarch64=25G +extraJavaOptions_arabic_2005_aarch64=-Xms25G + +numExectuors_arabic_2005_x86_64=35 +executorCores_arabic_2005_x86_64=8 +executorMemory_arabic_2005_x86_64=25G +extraJavaOptions_arabic_2005_x86_64=-Xms25G + +numExectuors_uk_2002_aarch64=35 +executorCores_uk_2002_aarch64=8 +executorMemory_uk_2002_aarch64=25G +extraJavaOptions_uk_2002_aarch64=-Xms25G + +numExectuors_uk_2002_x86_64=35 +executorCores_uk_2002_x86_64=8 
+executorMemory_uk_2002_x86_64=25G +extraJavaOptions_uk_2002_x86_64=-Xms25G \ No newline at end of file diff --git a/tools/kal-test/conf/graph/kcore/kcore.yml b/tools/kal-test/conf/graph/kcore/kcore.yml new file mode 100644 index 0000000..2ee40af --- /dev/null +++ b/tools/kal-test/conf/graph/kcore/kcore.yml @@ -0,0 +1,25 @@ +partition: + graph500_22_x86_64_yes: 216 + graph500_23_x86_64_yes: 216 + graph500_25_x86_64_yes: 236 + graph500_26_x86_64_yes: 236 + + graph500_22_aarch64_no: 240 + graph500_23_aarch64_no: 240 + graph500_25_aarch64_no: 552 + graph500_26_aarch64_no: 280 + + graph500_22_x86_64_no: 228 + graph500_23_x86_64_no: 228 + graph500_25_x86_64_no: 232 + graph500_26_x86_64_no: 232 +split: + graph500_22: " " + graph500_23: " " + graph500_25: " " + graph500_26: " " +iterNum: + graph500_22: 28 + graph500_23: 44 + graph500_25: 85 + graph500_26: 38 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/kcore/kcore_spark.properties b/tools/kal-test/conf/graph/kcore/kcore_spark.properties new file mode 100644 index 0000000..c3d8cd6 --- /dev/null +++ b/tools/kal-test/conf/graph/kcore/kcore_spark.properties @@ -0,0 +1,47 @@ +# opt +numExecutors_graph500_22_aarch64=12 +numExecutors_graph500_23_aarch64=12 +numExecutors_graph500_25_aarch64=23 +numExecutors_graph500_26_aarch64=35 +executorMemory_graph500_22_aarch64=78g +executorMemory_graph500_23_aarch64=78g +executorMemory_graph500_25_aarch64=38g +executorMemory_graph500_26_aarch64=25g +executorCores_graph500_22_aarch64=20 +executorCores_graph500_23_aarch64=20 +executorCores_graph500_25_aarch64=12 +executorCores_graph500_26_aarch64=8 +executorExtraJavaopts_graph500_22_aarch64="-Xms78g" +executorExtraJavaopts_graph500_23_aarch64="-Xms78g" +executorExtraJavaopts_graph500_25_aarch64="-Xms38g" +executorExtraJavaopts_graph500_26_aarch64="-Xms25g" +executorMemoryOverhead_graph500_22_aarch64=7985 +executorMemoryOverhead_graph500_23_aarch64=7985 +executorMemoryOverhead_graph500_25_aarch64=3892 +executorMemoryOverhead_graph500_26_aarch64=4096 + +# raw +numExecutors_graph500_22_x86_64=18 +numExecutors_graph500_23_x86_64=18 +numExecutors_graph500_25_x86_64=59 +numExecutors_graph500_26_x86_64=59 +executorMemory_graph500_22_x86_64=52g +executorMemory_graph500_23_x86_64=52g +executorMemory_graph500_25_x86_64=15g +executorMemory_graph500_26_x86_64=15g +executorCores_graph500_22_x86_64=12 +executorCores_graph500_23_x86_64=12 +executorCores_graph500_25_x86_64=4 +executorCores_graph500_26_x86_64=4 +executorExtraJavaopts_graph500_22_x86_64="-Xms52g" +executorExtraJavaopts_graph500_23_x86_64="-Xms52g" +executorExtraJavaopts_graph500_25_x86_64="-Xms15g" +executorExtraJavaopts_graph500_26_x86_64="-Xms15g" +executorMemoryOverhead_graph500_22_x86_64=5325 +executorMemoryOverhead_graph500_23_x86_64=5325 +executorMemoryOverhead_graph500_25_x86_64=1536 +executorMemoryOverhead_graph500_26_x86_64=4096 + +master=yarn +deployMode=client +driverMemory=200g \ No newline at end of file diff --git a/tools/kal-test/conf/graph/louvain/louvain.yml b/tools/kal-test/conf/graph/louvain/louvain.yml new file mode 100644 index 0000000..8dc2306 --- /dev/null +++ b/tools/kal-test/conf/graph/louvain/louvain.yml @@ -0,0 +1,32 @@ +# Louvain dataset parameters + +louvain: + graph500_22: + splitGraph: " " + maxIterations: 20 + isDirected: false + + graph500_24: + splitGraph: " " + maxIterations: 20 + isDirected: false + + graph500_25: + splitGraph: " " + maxIterations: 20 + isDirected: false + + cit_patents: + splitGraph: "\t" + maxIterations: 20 + isDirected: true + + uk_2002: + 
splitGraph: "\t" + maxIterations: 20 + isDirected: true + + arabic_2005: + splitGraph: "\t" + maxIterations: 20 + isDirected: true \ No newline at end of file diff --git a/tools/kal-test/conf/graph/louvain/louvain_spark.properties b/tools/kal-test/conf/graph/louvain/louvain_spark.properties new file mode 100644 index 0000000..082d0be --- /dev/null +++ b/tools/kal-test/conf/graph/louvain/louvain_spark.properties @@ -0,0 +1,74 @@ +# Spark parameters +deployMode=client + +graph500_22_numExecutors_aarch64=35 +graph500_22_executorCores_aarch64=8 +graph500_22_executorMemory_aarch64=25G +graph500_22_extraJavaOptions_aarch64=-Xms25G +graph500_22_numPartitions_aarch64=280 + +graph500_24_numExecutors_aarch64=71 +graph500_24_executorCores_aarch64=4 +graph500_24_executorMemory_aarch64=12G +graph500_24_extraJavaOptions_aarch64=-Xms12G +graph500_24_numPartitions_aarch64=284 + +graph500_25_numExecutors_aarch64=71 +graph500_25_executorCores_aarch64=4 +graph500_25_executorMemory_aarch64=12G +graph500_25_extraJavaOptions_aarch64=-Xms12G +graph500_25_numPartitions_aarch64=568 + +cit_patents_numExecutors_aarch64=35 +cit_patents_executorCores_aarch64=8 +cit_patents_executorMemory_aarch64=25G +cit_patents_extraJavaOptions_aarch64=-Xms25G +cit_patents_numPartitions_aarch64=280 + +uk_2002_numExecutors_aarch64=71 +uk_2002_executorCores_aarch64=4 +uk_2002_executorMemory_aarch64=12G +uk_2002_extraJavaOptions_aarch64=-Xms12G +uk_2002_numPartitions_aarch64=284 + +arabic_2005_numExecutors_aarch64=71 +arabic_2005_executorCores_aarch64=4 +arabic_2005_executorMemory_aarch64=12G +arabic_2005_extraJavaOptions_aarch64=-Xms12G +arabic_2005_numPartitions_aarch64=284 + +graph500_22_numExecutors_x86_64=29 +graph500_22_executorCores_x86_64=8 +graph500_22_executorMemory_x86_64=31G +graph500_22_extraJavaOptions_x86_64=-Xms31G +graph500_22_numPartitions_x86_64=232 + +graph500_24_numExecutors_x86_64=59 +graph500_24_executorCores_x86_64=4 +graph500_24_executorMemory_x86_64=15G +graph500_24_extraJavaOptions_x86_64=-Xms15G +graph500_24_numPartitions_x86_64=236 + +graph500_25_numExecutors_x86_64=59 +graph500_25_executorCores_x86_64=4 +graph500_25_executorMemory_x86_64=15G +graph500_25_extraJavaOptions_x86_64=-Xms15G +graph500_25_numPartitions_x86_64=472 + +cit_patents_numExecutors_x86_64=29 +cit_patents_executorCores_x86_64=8 +cit_patents_executorMemory_x86_64=31G +cit_patents_extraJavaOptions_x86_64=-Xms31G +cit_patents_numPartitions_x86_64=232 + +uk_2002_numExecutors_x86_64=59 +uk_2002_executorCores_x86_64=4 +uk_2002_executorMemory_x86_64=15G +uk_2002_extraJavaOptions_x86_64=-Xms15G +uk_2002_numPartitions_x86_64=236 + +arabic_2005_numExecutors_x86_64=59 +arabic_2005_executorCores_x86_64=4 +arabic_2005_executorMemory_x86_64=15G +arabic_2005_extraJavaOptions_x86_64=-Xms15g +arabic_2005_numPartitions_x86_64=236 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/lpa/lpa.yml b/tools/kal-test/conf/graph/lpa/lpa.yml new file mode 100644 index 0000000..8f8a206 --- /dev/null +++ b/tools/kal-test/conf/graph/lpa/lpa.yml @@ -0,0 +1,9 @@ +partition: + graph500_22_x86_64: 472 + graph500_22_aarch64: 282 + graph500_24_x86_64: 702 + graph500_24_aarch64: 282 + graph500_25_x86_64: 472 + graph500_25_aarch64: 282 +split: " " +maxSteps: 10 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/lpa/lpa_spark.properties b/tools/kal-test/conf/graph/lpa/lpa_spark.properties new file mode 100644 index 0000000..f66bcf8 --- /dev/null +++ b/tools/kal-test/conf/graph/lpa/lpa_spark.properties @@ -0,0 +1,33 @@ 
+numExecutors_graph500_22_aarch64=47 +executorMemory_graph500_22_aarch64=19g +executorCores_graph500_22_aarch64=6 +executorExtraJavaopts_graph500_22_aarch64="-Xms19g" + +numExecutors_graph500_24_aarch64=47 +executorMemory_graph500_24_aarch64=19g +executorCores_graph500_24_aarch64=6 +executorExtraJavaopts_graph500_24_aarch64="-Xms19g" + +numExecutors_graph500_25_aarch64=47 +executorMemory_graph500_25_aarch64=19g +executorCores_graph500_25_aarch64=6 +executorExtraJavaopts_graph500_25_aarch64="-Xms19g" + +numExecutors_graph500_22_x86_64=59 +executorMemory_graph500_22_x86_64=15g +executorCores_graph500_22_x86_64=4 +executorExtraJavaopts_graph500_22_x86_64="-Xms15g" + +numExecutors_graph500_24_x86_64=78 +executorMemory_graph500_24_x86_64=12g +executorCores_graph500_24_x86_64=3 +executorExtraJavaopts_graph500_24_x86_64="-Xms12g" + +numExecutors_graph500_25_x86_64=59 +executorMemory_graph500_25_x86_64=15g +executorCores_graph500_25_x86_64=4 +executorExtraJavaopts_graph500_25_x86_64="-Xms15g" + +master=yarn +deployMode=client +driverMemory=300g \ No newline at end of file diff --git a/tools/kal-test/conf/graph/mce/mce.yml b/tools/kal-test/conf/graph/mce/mce.yml new file mode 100644 index 0000000..6ea8fef --- /dev/null +++ b/tools/kal-test/conf/graph/mce/mce.yml @@ -0,0 +1,10 @@ +partition: + graph500_23: 284 + graph500_24: 284 + graph500_25: 568 +minK: 3 +maxDegree: 2000 +split: + graph500_23: " " + graph500_24: " " + graph500_25: " " diff --git a/tools/kal-test/conf/graph/mce/mce_spark.properties b/tools/kal-test/conf/graph/mce/mce_spark.properties new file mode 100644 index 0000000..625f037 --- /dev/null +++ b/tools/kal-test/conf/graph/mce/mce_spark.properties @@ -0,0 +1,9 @@ +numExecutors_aarch64=71 +numExecutors_x86_64=59 +executorCores=4 +executorMemory_aarch64=13g +executorMemory_x86_64=15g +extraJavaOptions_aarch64="-Xms13g" +extraJavaOptions_x86_64="-Xms15g" +deployMode=client +driverMemory=80g diff --git a/tools/kal-test/conf/graph/modularity/modularity.yml b/tools/kal-test/conf/graph/modularity/modularity.yml new file mode 100644 index 0000000..50400ec --- /dev/null +++ b/tools/kal-test/conf/graph/modularity/modularity.yml @@ -0,0 +1,44 @@ +# WCE dataset parameters + +modularity: + graph500_23: + splitGraph: " " + splitCommunity: " " + isWeighted: false + isDirected: false + numPartitions: 200 + + graph500_25: + splitGraph: " " + splitCommunity: "," + isWeighted: false + isDirected: false + numPartitions: 500 + + graph500_26: + splitGraph: " " + splitCommunity: " " + isWeighted: false + isDirected: false + numPartitions: 500 + + uk_2002: + splitGraph: "\t" + splitCommunity: "," + isWeighted: false + isDirected: true + numPartitions: 500 + + arabic_2005: + splitGraph: "\t" + splitCommunity: "," + isWeighted: false + isDirected: true + numPartitions: 500 + + twitter: + splitGraph: "\t" + splitCommunity: "," + isWeighted: false + isDirected: true + numPartitions: 1000 diff --git a/tools/kal-test/conf/graph/modularity/modularity_spark.properties b/tools/kal-test/conf/graph/modularity/modularity_spark.properties new file mode 100644 index 0000000..bad2d1c --- /dev/null +++ b/tools/kal-test/conf/graph/modularity/modularity_spark.properties @@ -0,0 +1,63 @@ +# Spark parameters +# opt +graph500_23_numExectuors_aarch64=35 +graph500_23_executorCores_aarch64=8 +graph500_23_executorMemory_aarch64=25G +graph500_23_extraJavaOptions_aarch64=-Xms25g + +graph500_25_numExectuors_aarch64=35 +graph500_25_executorCores_aarch64=8 +graph500_25_executorMemory_aarch64=25G 
+graph500_25_extraJavaOptions_aarch64=-Xms25g + +graph500_26_numExectuors_aarch64=90 +graph500_26_executorCores_aarch64=3 +graph500_26_executorMemory_aarch64=11G +graph500_26_extraJavaOptions_aarch64=-Xms11g + +uk_numExectuors_aarch64=47 +uk_executorCores_aarch64=6 +uk_executorMemory_aarch64=19G +uk_extraJavaOptions_aarch64=-Xms19g + +arabic_2005_numExectuors_aarch64=47 +arabic_2005_executorCores_aarch64=6 +arabic_2005_executorMemory_aarch64=19G +arabic_2005_extraJavaOptions_aarch64=-Xms19g + +twitter_numExectuors_aarch64=95 +twitter_executorCores_aarch64=3 +twitter_executorMemory_aarch64=9G +twitter_extraJavaOptions_aarch64=-Xms9g + +uk_numExectuors_x86_64=33 +uk_executorCores_x86_64=7 +uk_executorMemory_x86_64=28G +uk_extraJavaOptions_x86_64=-Xms28g + +arabic_2005_numExectuors_x86_64=47 +arabic_2005_executorCores_x86_64=5 +arabic_2005_executorMemory_x86_64=19G +arabic_2005_extraJavaOptions_x86_64=-Xms19g + +twitter_numExectuors_x86_64=78 +twitter_executorCores_x86_64=3 +twitter_executorMemory_x86_64=11G +twitter_extraJavaOptions_x86_64=-Xms11g + +# raw +graph500_23_numExectuors_x86_64=95 +graph500_23_executorCores_x86_64=3 +graph500_23_executorMemory_x86_64=9G +graph500_23_extraJavaOptions_x86_64=-Xms9g + +graph500_25_numExectuors_x86_64=95 +graph500_25_executorCores_x86_64=3 +graph500_25_executorMemory_x86_64=9G +graph500_25_extraJavaOptions_x86_64=-Xms9g + +graph500_26_numExectuors_x86_64=95 +graph500_26_executorCores_x86_64=3 +graph500_26_executorMemory_x86_64=9G +graph500_26_extraJavaOptions_x86_64=-Xms9g + diff --git a/tools/kal-test/conf/graph/mssp/mssp_spark.properties b/tools/kal-test/conf/graph/mssp/mssp_spark.properties new file mode 100644 index 0000000..aa3a42f --- /dev/null +++ b/tools/kal-test/conf/graph/mssp/mssp_spark.properties @@ -0,0 +1,9 @@ +# Spark parameters +splitGraph="\t" + +numExectuors=35 +executorCores=8 +executorMemory=25g +driverMemory=200g +extraJavaOptions=-Xms25g +computePartition=280 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/node2vec/node2vec.yml b/tools/kal-test/conf/graph/node2vec/node2vec.yml new file mode 100644 index 0000000..a67c42c --- /dev/null +++ b/tools/kal-test/conf/graph/node2vec/node2vec.yml @@ -0,0 +1,41 @@ +# Node2vec dataset parameters arm + +node2vec: + cit_patents: + partitions: 240 + directed: true + weighted: false + walkLength: 20 + numWalks: 5 + p: 1.0 + q: 1.0 + iteration: 10 + dimension: 128 + windowSize: 20 + splitGraph: "\t" + + soc_liveJournal: + partitions: 240 + directed: true + weighted: false + walkLength: 5 + numWalks: 5 + p: 1.0 + q: 1.0 + iteration: 10 + dimension: 128 + windowSize: 20 + splitGraph: "\t" + + uk_2002: + partitions: 240 + directed: true + weighted: false + walkLength: 5 + numWalks: 1 + p: 1.0 + q: 1.0 + iteration: 10 + dimension: 128 + windowSize: 20 + splitGraph: "\t" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/node2vec/node2vec_spark.properties b/tools/kal-test/conf/graph/node2vec/node2vec_spark.properties new file mode 100644 index 0000000..6a08e00 --- /dev/null +++ b/tools/kal-test/conf/graph/node2vec/node2vec_spark.properties @@ -0,0 +1,14 @@ +# Spark parameters +master=yarn +deployMode=client +driverMemory=300G + +numExecutors_aarch64=3 +executorMemory_aarch64=315g +driverCores_aarch64=61 +executorCores_aarch64=63 + +numExecutors_x86_64=21 +executorCores_x86_64=8 +executorMemory_x86_64=43G +driverCores_x86_64=75G diff --git a/tools/kal-test/conf/graph/ppr/ppr.yml b/tools/kal-test/conf/graph/ppr/ppr.yml new file mode 100644 index 0000000..33d4e62 --- 
/dev/null +++ b/tools/kal-test/conf/graph/ppr/ppr.yml @@ -0,0 +1,23 @@ +# PersonalizedPageRank dataset parameters arm + +ppr: + cit_patents: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 + sourcesPath: hdfs:///tmp/graph/dataset/ppr_sources/cit_patents + + uk_2002: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 + sourcesPath: hdfs:///tmp/graph/dataset/ppr_sources/uk_2002 + + arabic_2005: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 + sourcesPath: hdfs:///tmp/graph/dataset/ppr_sources/arabic_2005 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/ppr/ppr_source_id.properties b/tools/kal-test/conf/graph/ppr/ppr_source_id.properties new file mode 100644 index 0000000..361b1f6 --- /dev/null +++ b/tools/kal-test/conf/graph/ppr/ppr_source_id.properties @@ -0,0 +1,7 @@ +cit_patents_SourceID=5622600 +uk_2002_SourceID=16450200 +arabic_2005_SourceID=278400 + +#cit_patents_SourceID=5622600,5481601,5675700,5900101,4930200,4956300,5861707,5877028,5573854,5901425,5837429 +#uk_2002_SourceID=16450200,13157400,1074900,3930300,17115000,11099400,4335300,10651502,9378301,1895701,4714502,14070301,4997400 +#arabic_2005_SourceID=278400,4056900,10197600,19206300,7866300,11463600,7872900,12360300,1567500,13875000,16428900,17001300,8457300,3451800,10769700,20033100,20018700,16322400,5136900,6093600 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/ppr/ppr_spark.properties b/tools/kal-test/conf/graph/ppr/ppr_spark.properties new file mode 100644 index 0000000..6e0dd3c --- /dev/null +++ b/tools/kal-test/conf/graph/ppr/ppr_spark.properties @@ -0,0 +1,111 @@ +# Spark parameters +driverMemory=80g +deployMode=client + +fixMS_cit_patents_numExecutors_aarch64=12 +fixMS_cit_patents_executorCores_aarch64=23 +fixMS_cit_patents_executorMemory_aarch64=79G +fixMS_cit_patents_extraJavaOptions_aarch64=-Xms79G +fixMS_cit_patents_numPartitions_aarch64=72 + +fixMS_uk_2002_numExecutors_aarch64=12 +fixMS_uk_2002_executorCores_aarch64=23 +fixMS_uk_2002_executorMemory_aarch64=79G +fixMS_uk_2002_extraJavaOptions_aarch64=-Xms79G +fixMS_uk_2002_numPartitions_aarch64=102 + +fixMS_arabic_2005_numExecutors_aarch64=12 +fixMS_arabic_2005_executorCores_aarch64=23 +fixMS_arabic_2005_executorMemory_aarch64=79G +fixMS_arabic_2005_extraJavaOptions_aarch64=-Xms79G +fixMS_arabic_2005_numPartitions_aarch64=102 + +fixSS_cit_patents_numExecutors_aarch64=12 +fixSS_cit_patents_executorCores_aarch64=23 +fixSS_cit_patents_executorMemory_aarch64=79G +fixSS_cit_patents_extraJavaOptions_aarch64=-Xms79G +fixSS_cit_patents_numPartitions_aarch64=72 + +fixSS_uk_2002_numExecutors_aarch64=12 +fixSS_uk_2002_executorCores_aarch64=23 +fixSS_uk_2002_executorMemory_aarch64=79G +fixSS_uk_2002_extraJavaOptions_aarch64=-Xms79G +fixSS_uk_2002_numPartitions_aarch64=72 + +fixSS_arabic_2005_numExecutors_aarch64=36 +fixSS_arabic_2005_executorCores_aarch64=7 +fixSS_arabic_2005_executorMemory_aarch64=26G +fixSS_arabic_2005_extraJavaOptions_aarch64=-Xms26G +fixSS_arabic_2005_numPartitions_aarch64=120 + +conSS_cit_patents_numExecutors_aarch64=12 +conSS_cit_patents_executorCores_aarch64=23 +conSS_cit_patents_executorMemory_aarch64=79G +conSS_cit_patents_extraJavaOptions_aarch64=-Xms79G +conSS_cit_patents_numPartitions_aarch64=72 + +conSS_uk_2002_numExecutors_aarch64=12 +conSS_uk_2002_executorCores_aarch64=23 +conSS_uk_2002_executorMemory_aarch64=79G +conSS_uk_2002_extraJavaOptions_aarch64=-Xms79G +conSS_uk_2002_numPartitions_aarch64=72 + +conSS_arabic_2005_numExecutors_aarch64=36 
+conSS_arabic_2005_executorCores_aarch64=7 +conSS_arabic_2005_executorMemory_aarch64=26G +conSS_arabic_2005_extraJavaOptions_aarch64=-Xms26G +conSS_arabic_2005_numPartitions_aarch64=120 + +fixMS_cit_patents_numExecutors_x86_64=12 +fixMS_cit_patents_executorCores_x86_64=19 +fixMS_cit_patents_executorMemory_x86_64=79G +fixMS_cit_patents_extraJavaOptions_x86_64=-Xms79G +fixMS_cit_patents_numPartitions_x86_64=72 + +fixMS_uk_2002_numExecutors_x86_64=12 +fixMS_uk_2002_executorCores_x86_64=19 +fixMS_uk_2002_executorMemory_x86_64=79G +fixMS_uk_2002_extraJavaOptions_x86_64=-Xms79G +fixMS_uk_2002_numPartitions_x86_64=102 + +fixMS_arabic_2005_numExecutors_x86_64=12 +fixMS_arabic_2005_executorCores_x86_64=19 +fixMS_arabic_2005_executorMemory_x86_64=79G +fixMS_arabic_2005_extraJavaOptions_x86_64=-Xms79G +fixMS_arabic_2005_numPartitions_x86_64=102 + +fixSS_cit_patents_numExecutors_x86_64=12 +fixSS_cit_patents_executorCores_x86_64=19 +fixSS_cit_patents_executorMemory_x86_64=79G +fixSS_cit_patents_extraJavaOptions_x86_64=-Xms79G +fixSS_cit_patents_numPartitions_x86_64=72 + +fixSS_uk_2002_numExecutors_x86_64=12 +fixSS_uk_2002_executorCores_x86_64=19 +fixSS_uk_2002_executorMemory_x86_64=79G +fixSS_uk_2002_extraJavaOptions_x86_64=-Xms79G +fixSS_uk_2002_numPartitions_x86_64=102 + +fixSS_arabic_2005_numExecutors_x86_64=12 +fixSS_arabic_2005_executorCores_x86_64=19 +fixSS_arabic_2005_executorMemory_x86_64=79G +fixSS_arabic_2005_extraJavaOptions_x86_64=-Xms79G +fixSS_arabic_2005_numPartitions_x86_64=102 + +conSS_cit_patents_numExecutors_x86_64=12 +conSS_cit_patents_executorCores_x86_64=19 +conSS_cit_patents_executorMemory_x86_64=79G +conSS_cit_patents_extraJavaOptions_x86_64=-Xms79G +conSS_cit_patents_numPartitions_x86_64=72 + +conSS_uk_2002_numExecutors_x86_64=12 +conSS_uk_2002_executorCores_x86_64=19 +conSS_uk_2002_executorMemory_x86_64=79G +conSS_uk_2002_extraJavaOptions_x86_64=-Xms79G +conSS_uk_2002_numPartitions_x86_64=102 + +conSS_arabic_2005_numExecutors_x86_64=12 +conSS_arabic_2005_executorCores_x86_64=19 +conSS_arabic_2005_executorMemory_x86_64=79G +conSS_arabic_2005_extraJavaOptions_x86_64=-Xms79G +conSS_arabic_2005_numPartitions_x86_64=102 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/pr/pr.yml b/tools/kal-test/conf/graph/pr/pr.yml new file mode 100644 index 0000000..acc1010 --- /dev/null +++ b/tools/kal-test/conf/graph/pr/pr.yml @@ -0,0 +1,20 @@ +# PageRank dataset parameters arm + +pr: + cit_patents: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 + + uk_2002: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 + + arabic_2005: + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-7 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/pr/pr_spark.properties b/tools/kal-test/conf/graph/pr/pr_spark.properties new file mode 100644 index 0000000..abdbdc6 --- /dev/null +++ b/tools/kal-test/conf/graph/pr/pr_spark.properties @@ -0,0 +1,74 @@ +# Spark parameters +deployMode=client + +run_cit_patents_numExecutors_aarch64=39 +run_cit_patents_executorCores_aarch64=7 +run_cit_patents_executorMemory_aarch64=26G +run_cit_patents_extraJavaOptions_aarch64=-Xms26G +run_cit_patents_numPartitions_aarch64=240 + +run_uk_2002_numExecutors_aarch64=54 +run_uk_2002_executorCores_aarch64=5 +run_uk_2002_executorMemory_aarch64=19G +run_uk_2002_extraJavaOptions_aarch64=-Xms19G +run_uk_2002_numPartitions_aarch64=180 + +run_arabic_2005_numExecutors_aarch64=36 +run_arabic_2005_executorCores_aarch64=7 +run_arabic_2005_executorMemory_aarch64=28G 
+run_arabic_2005_extraJavaOptions_aarch64=-Xms28G +run_arabic_2005_numPartitions_aarch64=180 + +convergence_cit_patents_numExecutors_aarch64=12 +convergence_cit_patents_executorCores_aarch64=23 +convergence_cit_patents_executorMemory_aarch64=79G +convergence_cit_patents_extraJavaOptions_aarch64=-Xms79G +convergence_cit_patents_numPartitions_aarch64=72 + +convergence_uk_2002_numExecutors_aarch64=12 +convergence_uk_2002_executorCores_aarch64=23 +convergence_uk_2002_executorMemory_aarch64=79G +convergence_uk_2002_extraJavaOptions_aarch64=-Xms79G +convergence_uk_2002_numPartitions_aarch64=72 + +convergence_arabic_2005_numExecutors_aarch64=36 +convergence_arabic_2005_executorCores_aarch64=7 +convergence_arabic_2005_executorMemory_aarch64=26G +convergence_arabic_2005_extraJavaOptions_aarch64=-Xms26G +convergence_arabic_2005_numPartitions_aarch64=120 + +run_cit_patents_numExecutors_x86_64=12 +run_cit_patents_executorCores_x86_64=19 +run_cit_patents_executorMemory_x86_64=79G +run_cit_patents_extraJavaOptions_x86_64=-Xms79G +run_cit_patents_numPartitions_x86_64=72 + +run_uk_2002_numExecutors_x86_64=12 +run_uk_2002_executorCores_x86_64=19 +run_uk_2002_executorMemory_x86_64=79G +run_uk_2002_extraJavaOptions_x86_64=-Xms79G +run_uk_2002_numPartitions_x86_64=102 + +run_arabic_2005_numExecutors_x86_64=12 +run_arabic_2005_executorCores_x86_64=19 +run_arabic_2005_executorMemory_x86_64=79G +run_arabic_2005_extraJavaOptions_x86_64=-Xms79g +run_arabic_2005_numPartitions_x86_64=102 + +convergence_cit_patents_numExecutors_x86_64=12 +convergence_cit_patents_executorCores_x86_64=19 +convergence_cit_patents_executorMemory_x86_64=79G +convergence_cit_patents_extraJavaOptions_x86_64=-Xms79g +convergence_cit_patents_numPartitions_x86_64=72 + +convergence_uk_2002_numExecutors_x86_64=12 +convergence_uk_2002_executorCores_x86_64=19 +convergence_uk_2002_executorMemory_x86_64=79G +convergence_uk_2002_extraJavaOptions_x86_64=-Xms79g +convergence_uk_2002_numPartitions_x86_64=102 + +convergence_arabic_2005_numExecutors_x86_64=12 +convergence_arabic_2005_executorCores_x86_64=19 +convergence_arabic_2005_executorMemory_x86_64=79G +convergence_arabic_2005_extraJavaOptions_x86_64=-Xms79g +convergence_arabic_2005_numPartitions_x86_64=102 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/scc/scc.yml b/tools/kal-test/conf/graph/scc/scc.yml new file mode 100644 index 0000000..1ecca96 --- /dev/null +++ b/tools/kal-test/conf/graph/scc/scc.yml @@ -0,0 +1,4 @@ +split: + cit_patents: "\t" + enwiki_2018: "\t" + arabic_2005: "\t" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/scc/scc_spark.properties b/tools/kal-test/conf/graph/scc/scc_spark.properties new file mode 100644 index 0000000..7ea3e02 --- /dev/null +++ b/tools/kal-test/conf/graph/scc/scc_spark.properties @@ -0,0 +1,20 @@ +executorMemory_aarch64=12g +executorCores_aarch64=4 +numExecutors_aarch64=71 + +executorMemory_aarch64_arabic_2005=25g +executorCores_aarch64_arabic_2005=8 +numExecutors_aarch64_arabic_2005=35 + +executorMemory_x86_64=15g +executorCores_x86_64=4 +numExecutors_x86_64=59 + +executorMemory_x86_64_arabic_2005=23g +executorCores_x86_64_arabic_2005=6 +numExecutors_x86_64_arabic_2005=39 + +master=yarn +deployMode=client +driverMemory=100g +executor_extra_javaopts="-Xms${!executor_memory_val} -XX:hashCode=0" diff --git a/tools/kal-test/conf/graph/sgm/sgm.yml b/tools/kal-test/conf/graph/sgm/sgm.yml new file mode 100644 index 0000000..be8a81d --- /dev/null +++ b/tools/kal-test/conf/graph/sgm/sgm.yml @@ -0,0 +1,6 @@ +splitDataGraph: + 
graph500_19: "," + liveJournal: "\t" + com_orkut: "\t" +splitQueryGraph: "," +resultNum: 10000 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/sgm/sgm_spark.properties b/tools/kal-test/conf/graph/sgm/sgm_spark.properties new file mode 100644 index 0000000..4555047 --- /dev/null +++ b/tools/kal-test/conf/graph/sgm/sgm_spark.properties @@ -0,0 +1,367 @@ +# Spark parameters +deployMode=client +driverMemory=16g +rpcAskTime=36000 +schedulerMaxRegisteredResourcesWaitingTime=3600000 +workerTimeout=3600 +networkTimeout=6000s +storageBlockManagerSlaveTimeoutMs=600000 +shuffleBlockTransferService=nio +driverMaxResultSize=200g +shuffleManager=SORT +broadcastBlockSize=25g +rpcMessageMaxSize=2046 +coreConnectionAckWaitTimeout=60000s +storageMemoryFraction=0.2 +shuffleMemoryFraction=0.6 +rddCompress=true +memoryUseLegacyMode=true + +# opt +graph500_19_4dgn_unIdentical_numberTask_aarch64=1500 +graph500_19_4dgn_unIdentical_numExecutors_aarch64=35 +graph500_19_4dgn_unIdentical_executorCores_aarch64=8 +graph500_19_4dgn_unIdentical_executorMemory_aarch64=25g +graph500_19_4dgn_unIdentical_numPartitions_aarch64=280 +graph500_19_4dgn_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_4clique_unIdentical_numberTask_aarch64=500 +graph500_19_4clique_unIdentical_numExecutors_aarch64=35 +graph500_19_4clique_unIdentical_executorCores_aarch64=8 +graph500_19_4clique_unIdentical_executorMemory_aarch64=25g +graph500_19_4clique_unIdentical_numPartitions_aarch64=280 +graph500_19_4clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_5clique_unIdentical_numberTask_aarch64=500 +graph500_19_5clique_unIdentical_numExecutors_aarch64=35 +graph500_19_5clique_unIdentical_executorCores_aarch64=8 +graph500_19_5clique_unIdentical_executorMemory_aarch64=25g +graph500_19_5clique_unIdentical_numPartitions_aarch64=280 +graph500_19_5clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_6clique_unIdentical_numberTask_aarch64=1000 +graph500_19_6clique_unIdentical_numExecutors_aarch64=47 +graph500_19_6clique_unIdentical_executorCores_aarch64=6 +graph500_19_6clique_unIdentical_executorMemory_aarch64=19g +graph500_19_6clique_unIdentical_numPartitions_aarch64=282 +graph500_19_6clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms19g + +liveJournal_4dgn_unIdentical_numberTask_aarch64=1000 +liveJournal_4dgn_unIdentical_numExecutors_aarch64=47 +liveJournal_4dgn_unIdentical_executorCores_aarch64=6 +liveJournal_4dgn_unIdentical_executorMemory_aarch64=19g +liveJournal_4dgn_unIdentical_numPartitions_aarch64=282 +liveJournal_4dgn_unIdentical_executorExtraJavaOptions_aarch64=-Xms19g + +liveJournal_4clique_unIdentical_numberTask_aarch64=500 +liveJournal_4clique_unIdentical_numExecutors_aarch64=35 +liveJournal_4clique_unIdentical_executorCores_aarch64=8 +liveJournal_4clique_unIdentical_executorMemory_aarch64=25g +liveJournal_4clique_unIdentical_numPartitions_aarch64=280 +liveJournal_4clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_5clique_unIdentical_numberTask_aarch64=1500 +liveJournal_5clique_unIdentical_numExecutors_aarch64=35 +liveJournal_5clique_unIdentical_executorCores_aarch64=8 +liveJournal_5clique_unIdentical_executorMemory_aarch64=25g +liveJournal_5clique_unIdentical_numPartitions_aarch64=280 +liveJournal_5clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_6clique_unIdentical_numberTask_aarch64=2000 +liveJournal_6clique_unIdentical_numExecutors_aarch64=35 +liveJournal_6clique_unIdentical_executorCores_aarch64=8 
+liveJournal_6clique_unIdentical_executorMemory_aarch64=25g +liveJournal_6clique_unIdentical_numPartitions_aarch64=280 +liveJournal_6clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_4dgn_unIdentical_numberTask_aarch64=1000 +com_orkut_4dgn_unIdentical_numExecutors_aarch64=35 +com_orkut_4dgn_unIdentical_executorCores_aarch64=8 +com_orkut_4dgn_unIdentical_executorMemory_aarch64=25g +com_orkut_4dgn_unIdentical_numPartitions_aarch64=280 +com_orkut_4dgn_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_4clique_unIdentical_numberTask_aarch64=500 +com_orkut_4clique_unIdentical_numExecutors_aarch64=35 +com_orkut_4clique_unIdentical_executorCores_aarch64=8 +com_orkut_4clique_unIdentical_executorMemory_aarch64=25g +com_orkut_4clique_unIdentical_numPartitions_aarch64=280 +com_orkut_4clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_5clique_unIdentical_numberTask_aarch64=500 +com_orkut_5clique_unIdentical_numExecutors_aarch64=35 +com_orkut_5clique_unIdentical_executorCores_aarch64=8 +com_orkut_5clique_unIdentical_executorMemory_aarch64=25g +com_orkut_5clique_unIdentical_numPartitions_aarch64=280 +com_orkut_5clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_6clique_unIdentical_numberTask_aarch64=500 +com_orkut_6clique_unIdentical_numExecutors_aarch64=47 +com_orkut_6clique_unIdentical_executorCores_aarch64=6 +com_orkut_6clique_unIdentical_executorMemory_aarch64=19g +com_orkut_6clique_unIdentical_numPartitions_aarch64=282 +com_orkut_6clique_unIdentical_executorExtraJavaOptions_aarch64=-Xms19g + +graph500_19_4dgn_Identical_numberTask_aarch64=3000 +graph500_19_4dgn_Identical_numExecutors_aarch64=35 +graph500_19_4dgn_Identical_executorCores_aarch64=8 +graph500_19_4dgn_Identical_executorMemory_aarch64=25g +graph500_19_4dgn_Identical_numPartitions_aarch64=280 +graph500_19_4dgn_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_4sqr_Identical_numberTask_aarch64=1000 +graph500_19_4sqr_Identical_numExecutors_aarch64=35 +graph500_19_4sqr_Identical_executorCores_aarch64=8 +graph500_19_4sqr_Identical_executorMemory_aarch64=25g +graph500_19_4sqr_Identical_numPartitions_aarch64=280 +graph500_19_4sqr_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_5tree_Identical_numberTask_aarch64=500 +graph500_19_5tree_Identical_numExecutors_aarch64=35 +graph500_19_5tree_Identical_executorCores_aarch64=8 +graph500_19_5tree_Identical_executorMemory_aarch64=25g +graph500_19_5tree_Identical_numPartitions_aarch64=280 +graph500_19_5tree_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_6star_Identical_numberTask_aarch64=500 +graph500_19_6star_Identical_numExecutors_aarch64=35 +graph500_19_6star_Identical_executorCores_aarch64=8 +graph500_19_6star_Identical_executorMemory_aarch64=25g +graph500_19_6star_Identical_numPartitions_aarch64=280 +graph500_19_6star_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_4dgn_Identical_numberTask_aarch64=1000 +liveJournal_4dgn_Identical_numExecutors_aarch64=35 +liveJournal_4dgn_Identical_executorCores_aarch64=8 +liveJournal_4dgn_Identical_executorMemory_aarch64=25g +liveJournal_4dgn_Identical_numPartitions_aarch64=280 +liveJournal_4dgn_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_4sqr_Identical_numberTask_aarch64=1000 +liveJournal_4sqr_Identical_numExecutors_aarch64=35 +liveJournal_4sqr_Identical_executorCores_aarch64=8 +liveJournal_4sqr_Identical_executorMemory_aarch64=25g +liveJournal_4sqr_Identical_numPartitions_aarch64=280 
+liveJournal_4sqr_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_5tree_Identical_numberTask_aarch64=1000 +liveJournal_5tree_Identical_numExecutors_aarch64=35 +liveJournal_5tree_Identical_executorCores_aarch64=8 +liveJournal_5tree_Identical_executorMemory_aarch64=25g +liveJournal_5tree_Identical_numPartitions_aarch64=280 +liveJournal_5tree_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +liveJournal_6star_Identical_numberTask_aarch64=1000 +liveJournal_6star_Identical_numExecutors_aarch64=35 +liveJournal_6star_Identical_executorCores_aarch64=8 +liveJournal_6star_Identical_executorMemory_aarch64=25g +liveJournal_6star_Identical_numPartitions_aarch64=280 +liveJournal_6star_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_4dgn_Identical_numberTask_aarch64=3000 +com_orkut_4dgn_Identical_numExecutors_aarch64=35 +com_orkut_4dgn_Identical_executorCores_aarch64=8 +com_orkut_4dgn_Identical_executorMemory_aarch64=25g +com_orkut_4dgn_Identical_numPartitions_aarch64=280 +com_orkut_4dgn_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_4sqr_Identical_numberTask_aarch64=3000 +com_orkut_4sqr_Identical_numExecutors_aarch64=35 +com_orkut_4sqr_Identical_executorCores_aarch64=8 +com_orkut_4sqr_Identical_executorMemory_aarch64=25g +com_orkut_4sqr_Identical_numPartitions_aarch64=280 +com_orkut_4sqr_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_5tree_Identical_numberTask_aarch64=1000 +com_orkut_5tree_Identical_numExecutors_aarch64=35 +com_orkut_5tree_Identical_executorCores_aarch64=8 +com_orkut_5tree_Identical_executorMemory_aarch64=25g +com_orkut_5tree_Identical_numPartitions_aarch64=280 +com_orkut_5tree_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +com_orkut_6star_Identical_numberTask_aarch64=1000 +com_orkut_6star_Identical_numExecutors_aarch64=35 +com_orkut_6star_Identical_executorCores_aarch64=8 +com_orkut_6star_Identical_executorMemory_aarch64=25g +com_orkut_6star_Identical_numPartitions_aarch64=280 +com_orkut_6star_Identical_executorExtraJavaOptions_aarch64=-Xms25g + +graph500_19_4dgn_unIdentical_numberTask_x86_64=1500 +graph500_19_4dgn_unIdentical_numExecutors_x86_64=29 +graph500_19_4dgn_unIdentical_executorCores_x86_64=8 +graph500_19_4dgn_unIdentical_executorMemory_x86_64=31g +graph500_19_4dgn_unIdentical_numPartitions_x86_64=232 +graph500_19_4dgn_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_4clique_unIdentical_numberTask_x86_64=500 +graph500_19_4clique_unIdentical_numExecutors_x86_64=29 +graph500_19_4clique_unIdentical_executorCores_x86_64=8 +graph500_19_4clique_unIdentical_executorMemory_x86_64=31g +graph500_19_4clique_unIdentical_numPartitions_x86_64=232 +graph500_19_4clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_5clique_unIdentical_numberTask_x86_64=1000 +graph500_19_5clique_unIdentical_numExecutors_x86_64=29 +graph500_19_5clique_unIdentical_executorCores_x86_64=8 +graph500_19_5clique_unIdentical_executorMemory_x86_64=31g +graph500_19_5clique_unIdentical_numPartitions_x86_64=232 +graph500_19_5clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_6clique_unIdentical_numberTask_x86_64=1000 +graph500_19_6clique_unIdentical_numExecutors_x86_64=29 +graph500_19_6clique_unIdentical_executorCores_x86_64=8 +graph500_19_6clique_unIdentical_executorMemory_x86_64=31g +graph500_19_6clique_unIdentical_numPartitions_x86_64=232 +graph500_19_6clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_4dgn_unIdentical_numberTask_x86_64=1000 
+liveJournal_4dgn_unIdentical_numExecutors_x86_64=29 +liveJournal_4dgn_unIdentical_executorCores_x86_64=8 +liveJournal_4dgn_unIdentical_executorMemory_x86_64=31g +liveJournal_4dgn_unIdentical_numPartitions_x86_64=232 +liveJournal_4dgn_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_4clique_unIdentical_numberTask_x86_64=1500 +liveJournal_4clique_unIdentical_numExecutors_x86_64=29 +liveJournal_4clique_unIdentical_executorCores_x86_64=8 +liveJournal_4clique_unIdentical_executorMemory_x86_64=31g +liveJournal_4clique_unIdentical_numPartitions_x86_64=232 +liveJournal_4clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_5clique_unIdentical_numberTask_x86_64=1000 +liveJournal_5clique_unIdentical_numExecutors_x86_64=39 +liveJournal_5clique_unIdentical_executorCores_x86_64=6 +liveJournal_5clique_unIdentical_executorMemory_x86_64=23g +liveJournal_5clique_unIdentical_numPartitions_x86_64=234 +liveJournal_5clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms23g + +liveJournal_6clique_unIdentical_numberTask_x86_64=2000 +liveJournal_6clique_unIdentical_numExecutors_x86_64=29 +liveJournal_6clique_unIdentical_executorCores_x86_64=8 +liveJournal_6clique_unIdentical_executorMemory_x86_64=31g +liveJournal_6clique_unIdentical_numPartitions_x86_64=232 +liveJournal_6clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_4dgn_unIdentical_numberTask_x86_64=1500 +com_orkut_4dgn_unIdentical_numExecutors_x86_64=29 +com_orkut_4dgn_unIdentical_executorCores_x86_64=8 +com_orkut_4dgn_unIdentical_executorMemory_x86_64=31g +com_orkut_4dgn_unIdentical_numPartitions_x86_64=232 +com_orkut_4dgn_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_4clique_unIdentical_numberTask_x86_64=500 +com_orkut_4clique_unIdentical_numExecutors_x86_64=29 +com_orkut_4clique_unIdentical_executorCores_x86_64=8 +com_orkut_4clique_unIdentical_executorMemory_x86_64=31g +com_orkut_4clique_unIdentical_numPartitions_x86_64=232 +com_orkut_4clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_5clique_unIdentical_numberTask_x86_64=500 +com_orkut_5clique_unIdentical_numExecutors_x86_64=29 +com_orkut_5clique_unIdentical_executorCores_x86_64=8 +com_orkut_5clique_unIdentical_executorMemory_x86_64=31g +com_orkut_5clique_unIdentical_numPartitions_x86_64=232 +com_orkut_5clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_6clique_unIdentical_numberTask_x86_64=1000 +com_orkut_6clique_unIdentical_numExecutors_x86_64=39 +com_orkut_6clique_unIdentical_executorCores_x86_64=6 +com_orkut_6clique_unIdentical_executorMemory_x86_64=23g +com_orkut_6clique_unIdentical_numPartitions_x86_64=234 +com_orkut_6clique_unIdentical_executorExtraJavaOptions_x86_64=-Xms23g + +graph500_19_4dgn_Identical_numberTask_x86_64=2000 +graph500_19_4dgn_Identical_numExecutors_x86_64=29 +graph500_19_4dgn_Identical_executorCores_x86_64=8 +graph500_19_4dgn_Identical_executorMemory_x86_64=31g +graph500_19_4dgn_Identical_numPartitions_x86_64=232 +graph500_19_4dgn_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_4sqr_Identical_numberTask_x86_64=1000 +graph500_19_4sqr_Identical_numExecutors_x86_64=29 +graph500_19_4sqr_Identical_executorCores_x86_64=8 +graph500_19_4sqr_Identical_executorMemory_x86_64=31g +graph500_19_4sqr_Identical_numPartitions_x86_64=232 +graph500_19_4sqr_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_5tree_Identical_numberTask_x86_64=500 +graph500_19_5tree_Identical_numExecutors_x86_64=29 
+graph500_19_5tree_Identical_executorCores_x86_64=8 +graph500_19_5tree_Identical_executorMemory_x86_64=31g +graph500_19_5tree_Identical_numPartitions_x86_64=232 +graph500_19_5tree_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +graph500_19_6star_Identical_numberTask_x86_64=1000 +graph500_19_6star_Identical_numExecutors_x86_64=29 +graph500_19_6star_Identical_executorCores_x86_64=8 +graph500_19_6star_Identical_executorMemory_x86_64=31g +graph500_19_6star_Identical_numPartitions_x86_64=232 +graph500_19_6star_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_4dgn_Identical_numberTask_x86_64=2000 +liveJournal_4dgn_Identical_numExecutors_x86_64=29 +liveJournal_4dgn_Identical_executorCores_x86_64=8 +liveJournal_4dgn_Identical_executorMemory_x86_64=31g +liveJournal_4dgn_Identical_numPartitions_x86_64=232 +liveJournal_4dgn_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_4sqr_Identical_numberTask_x86_64=2000 +liveJournal_4sqr_Identical_numExecutors_x86_64=29 +liveJournal_4sqr_Identical_executorCores_x86_64=8 +liveJournal_4sqr_Identical_executorMemory_x86_64=31g +liveJournal_4sqr_Identical_numPartitions_x86_64=232 +liveJournal_4sqr_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_5tree_Identical_numberTask_x86_64=1000 +liveJournal_5tree_Identical_numExecutors_x86_64=29 +liveJournal_5tree_Identical_executorCores_x86_64=8 +liveJournal_5tree_Identical_executorMemory_x86_64=31g +liveJournal_5tree_Identical_numPartitions_x86_64=232 +liveJournal_5tree_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +liveJournal_6star_Identical_numberTask_x86_64=1000 +liveJournal_6star_Identical_numExecutors_x86_64=29 +liveJournal_6star_Identical_executorCores_x86_64=8 +liveJournal_6star_Identical_executorMemory_x86_64=31g +liveJournal_6star_Identical_numPartitions_x86_64=232 +liveJournal_6star_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_4dgn_Identical_numberTask_x86_64=3000 +com_orkut_4dgn_Identical_numExecutors_x86_64=29 +com_orkut_4dgn_Identical_executorCores_x86_64=8 +com_orkut_4dgn_Identical_executorMemory_x86_64=31g +com_orkut_4dgn_Identical_numPartitions_x86_64=232 +com_orkut_4dgn_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_4sqr_Identical_numberTask_x86_64=3000 +com_orkut_4sqr_Identical_numExecutors_x86_64=29 +com_orkut_4sqr_Identical_executorCores_x86_64=8 +com_orkut_4sqr_Identical_executorMemory_x86_64=31g +com_orkut_4sqr_Identical_numPartitions_x86_64=232 +com_orkut_4sqr_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_5tree_Identical_numberTask_x86_64=1000 +com_orkut_5tree_Identical_numExecutors_x86_64=29 +com_orkut_5tree_Identical_executorCores_x86_64=8 +com_orkut_5tree_Identical_executorMemory_x86_64=31g +com_orkut_5tree_Identical_numPartitions_x86_64=232 +com_orkut_5tree_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +com_orkut_6star_Identical_numberTask_x86_64=500 +com_orkut_6star_Identical_numExecutors_x86_64=29 +com_orkut_6star_Identical_executorCores_x86_64=8 +com_orkut_6star_Identical_executorMemory_x86_64=31g +com_orkut_6star_Identical_numPartitions_x86_64=232 +com_orkut_6star_Identical_executorExtraJavaOptions_x86_64=-Xms31g + +# raw +numberColors=12 +numExecutors=29 +executorCores=8 +executorMemory=31g +numPartitions=232 +executorExtraJavaOptions=-Xms31g + +graph500_19_split="," +liveJournal_split="\t" +com_orkut_split="\t" \ No newline at end of file diff --git a/tools/kal-test/conf/graph/tc/tc.yml b/tools/kal-test/conf/graph/tc/tc.yml new file mode 100644 index 0000000..fa3438c --- 
/dev/null +++ b/tools/kal-test/conf/graph/tc/tc.yml @@ -0,0 +1,8 @@ +partition: 720 +split: + graph500_22: " " + graph500_23: " " + graph500_24: " " + graph500_25: " " + graph500_26: " " + diff --git a/tools/kal-test/conf/graph/tc/tc_spark.properties b/tools/kal-test/conf/graph/tc/tc_spark.properties new file mode 100644 index 0000000..2f3a8b6 --- /dev/null +++ b/tools/kal-test/conf/graph/tc/tc_spark.properties @@ -0,0 +1,11 @@ +numExecutors_aarch64=74 +numExecutors_x86_64=59 +executorCores=4 +executorMemory_aarch64=12g +executorMemory_x86_64=16g +extraJavaOptions_aarch64=-Xms12g +extraJavaOptions_x86_64=-Xms16g +master=yarn +deployMode=client +driverCores=36 +driverMemory=50g diff --git a/tools/kal-test/conf/graph/tpr/tpr.yml b/tools/kal-test/conf/graph/tpr/tpr.yml new file mode 100644 index 0000000..c561ddc --- /dev/null +++ b/tools/kal-test/conf/graph/tpr/tpr.yml @@ -0,0 +1,23 @@ +tpr: + twitter_tpr: + opt: + splitGraph: "," + numIter: 100 + numPartitions: 273 + resetProb: 0.15 + isOnlySrc: false + + raw: + splitGraph: "," + numIter: 100 + numPartitions: 236 + resetProb: 0.15 + isOnlySrc: false + + twitter_2010: + opt: + splitGraph: "\t" + numIter: 100 + numPartitions: 273 + resetProb: 0.15 + isOnlySrc: false diff --git a/tools/kal-test/conf/graph/tpr/tpr_spark.properties b/tools/kal-test/conf/graph/tpr/tpr_spark.properties new file mode 100644 index 0000000..bdebff8 --- /dev/null +++ b/tools/kal-test/conf/graph/tpr/tpr_spark.properties @@ -0,0 +1,11 @@ +deployMode=client + +numExectuors_twitter_tpr_aarch64=39 +executorCores_twitter_tpr_aarch64=7 +executorMemory_twitter_tpr_aarch64=23G +extraJavaOptions_twitter_tpr_aarch64=-Xms23g + +numExectuors_twitter_tpr_x86_64=59 +executorCores_twitter_tpr_x86_64=4 +executorMemory_twitter_tpr_x86_64=15G +extraJavaOptions_twitter_tpr_x86_64=-Xms15g diff --git a/tools/kal-test/conf/graph/tr/tr.yml b/tools/kal-test/conf/graph/tr/tr.yml new file mode 100644 index 0000000..09622ae --- /dev/null +++ b/tools/kal-test/conf/graph/tr/tr.yml @@ -0,0 +1,64 @@ +trustRank: + cit_patents_100: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/cit-Patents_100 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + cit_patents_500: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/cit-Patents_500 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + cit_patents_1000: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/cit-Patents_1000 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + uk_2002_100: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/uk-2002-edgelist_100 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + uk_2002_500: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/uk-2002-edgelist_500 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + uk_2002_1000: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/uk-2002-edgelist_1000 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + arabic_2005_100: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/arabic-2005-edgelist_100 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + arabic_2005_500: + outputPath: hdfs:///tmp/graph/result/tr/arabic_2005_500 + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/arabic-2005-edgelist_500 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 + + arabic_2005_1000: + seedsPath: hdfs:///tmp/graph/dataset/tr_seeds/arabic-2005-edgelist_1000 + splitGraph: "\t" + numIter: 100 + resetProb: 0.15 + tolerance: 1e-8 \ No newline at end of file 
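Note on the tr_spark.properties file that follows: its keys use an <api>_<dataset>_<parameter>_<architecture> naming pattern (for example runUntilConvergence_cit_patents_100_numExecutors_aarch64). The short sketch below illustrates how a run script could resolve such keys into spark-submit options via bash indirect expansion, the same ${!name} trick used in scc_spark.properties above. It is only an illustration under assumptions — the relative path, the positional arguments, and the use of uname -m are not taken from bin/graph/tr_run.sh, whose actual logic may differ.

    #!/bin/bash
    # Sketch: resolve per-API/dataset/architecture tuning keys from tr_spark.properties.
    # Assumed layout and argument handling; not the real tr_run.sh.
    set -euo pipefail

    api=${1:-runUntilConvergence}     # or "run"
    dataset=${2:-cit_patents_100}
    cpu_name=$(uname -m)              # aarch64 or x86_64

    # Keys are plain shell identifiers (letters, digits, underscores),
    # so the properties file can be sourced directly.
    source conf/graph/tr/tr_spark.properties

    num_executors_key="${api}_${dataset}_numExecutors_${cpu_name}"
    executor_cores_key="${api}_${dataset}_executorCores_${cpu_name}"
    executor_memory_key="${api}_${dataset}_executorMemory_${cpu_name}"
    extra_java_options_key="${api}_${dataset}_extraJavaOptions_${cpu_name}"
    num_partitions_key="${api}_${dataset}_numPartitions_${cpu_name}"

    # ${!name} expands to the value of the variable whose name is stored in $name.
    # With "set -u" an unknown api/dataset/arch combination fails fast here.
    echo "spark-submit --master yarn --deploy-mode ${deployMode} \\"
    echo "  --driver-memory ${driverMemory} \\"
    echo "  --num-executors ${!num_executors_key} \\"
    echo "  --executor-cores ${!executor_cores_key} \\"
    echo "  --executor-memory ${!executor_memory_key} \\"
    echo "  --conf spark.executor.extraJavaOptions=${!extra_java_options_key} \\"
    echo "  <application jar and arguments, including ${!num_partitions_key} partitions>"
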
diff --git a/tools/kal-test/conf/graph/tr/tr_spark.properties b/tools/kal-test/conf/graph/tr/tr_spark.properties new file mode 100644 index 0000000..00b920d --- /dev/null +++ b/tools/kal-test/conf/graph/tr/tr_spark.properties @@ -0,0 +1,219 @@ +# Spark parameters +deployMode=client +driverMemory=100g + +runUntilConvergence_cit_patents_100_numExecutors_aarch64=12 +runUntilConvergence_cit_patents_100_executorCores_aarch64=23 +runUntilConvergence_cit_patents_100_executorMemory_aarch64=79G +runUntilConvergence_cit_patents_100_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_cit_patents_100_numPartitions_aarch64=72 + +runUntilConvergence_cit_patents_500_numExecutors_aarch64=12 +runUntilConvergence_cit_patents_500_executorCores_aarch64=23 +runUntilConvergence_cit_patents_500_executorMemory_aarch64=79G +runUntilConvergence_cit_patents_500_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_cit_patents_500_numPartitions_aarch64=102 + +runUntilConvergence_cit_patents_1000_numExecutors_aarch64=12 +runUntilConvergence_cit_patents_1000_executorCores_aarch64=23 +runUntilConvergence_cit_patents_1000_executorMemory_aarch64=79G +runUntilConvergence_cit_patents_1000_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_cit_patents_1000_numPartitions_aarch64=72 + +runUntilConvergence_uk_2002_100_numExecutors_aarch64=12 +runUntilConvergence_uk_2002_100_executorCores_aarch64=23 +runUntilConvergence_uk_2002_100_executorMemory_aarch64=79G +runUntilConvergence_uk_2002_100_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_uk_2002_100_numPartitions_aarch64=120 + +runUntilConvergence_uk_2002_500_numExecutors_aarch64=12 +runUntilConvergence_uk_2002_500_executorCores_aarch64=23 +runUntilConvergence_uk_2002_500_executorMemory_aarch64=79G +runUntilConvergence_uk_2002_500_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_uk_2002_500_numPartitions_aarch64=102 + +runUntilConvergence_uk_2002_1000_numExecutors_aarch64=12 +runUntilConvergence_uk_2002_1000_executorCores_aarch64=23 +runUntilConvergence_uk_2002_1000_executorMemory_aarch64=79G +runUntilConvergence_uk_2002_1000_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_uk_2002_1000_numPartitions_aarch64=120 + +runUntilConvergence_arabic_2005_100_numExecutors_aarch64=12 +runUntilConvergence_arabic_2005_100_executorCores_aarch64=23 +runUntilConvergence_arabic_2005_100_executorMemory_aarch64=79G +runUntilConvergence_arabic_2005_100_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_arabic_2005_100_numPartitions_aarch64=102 + +runUntilConvergence_arabic_2005_500_numExecutors_aarch64=12 +runUntilConvergence_arabic_2005_500_executorCores_aarch64=23 +runUntilConvergence_arabic_2005_500_executorMemory_aarch64=79G +runUntilConvergence_arabic_2005_500_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_arabic_2005_500_numPartitions_aarch64=120 + +runUntilConvergence_arabic_2005_1000_numExecutors_aarch64=12 +runUntilConvergence_arabic_2005_1000_executorCores_aarch64=23 +runUntilConvergence_arabic_2005_1000_executorMemory_aarch64=79G +runUntilConvergence_arabic_2005_1000_extraJavaOptions_aarch64=-Xms79G +runUntilConvergence_arabic_2005_1000_numPartitions_aarch64=120 + +runUntilConvergence_cit_patents_100_numExecutors_x86_64=12 +runUntilConvergence_cit_patents_100_executorCores_x86_64=19 +runUntilConvergence_cit_patents_100_executorMemory_x86_64=79G +runUntilConvergence_cit_patents_100_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_cit_patents_100_numPartitions_x86_64=72 + +runUntilConvergence_cit_patents_500_numExecutors_x86_64=12 
+runUntilConvergence_cit_patents_500_executorCores_x86_64=19 +runUntilConvergence_cit_patents_500_executorMemory_x86_64=79G +runUntilConvergence_cit_patents_500_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_cit_patents_500_numPartitions_x86_64=72 + +runUntilConvergence_cit_patents_1000_numExecutors_x86_64=12 +runUntilConvergence_cit_patents_1000_executorCores_x86_64=19 +runUntilConvergence_cit_patents_1000_executorMemory_x86_64=79G +runUntilConvergence_cit_patents_1000_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_cit_patents_1000_numPartitions_x86_64=72 + +runUntilConvergence_uk_2002_100_numExecutors_x86_64=12 +runUntilConvergence_uk_2002_100_executorCores_x86_64=19 +runUntilConvergence_uk_2002_100_executorMemory_x86_64=79G +runUntilConvergence_uk_2002_100_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_uk_2002_100_numPartitions_x86_64=120 + +runUntilConvergence_uk_2002_500_numExecutors_x86_64=12 +runUntilConvergence_uk_2002_500_executorCores_x86_64=19 +runUntilConvergence_uk_2002_500_executorMemory_x86_64=79G +runUntilConvergence_uk_2002_500_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_uk_2002_500_numPartitions_x86_64=102 + +runUntilConvergence_uk_2002_1000_numExecutors_x86_64=12 +runUntilConvergence_uk_2002_1000_executorCores_x86_64=19 +runUntilConvergence_uk_2002_1000_executorMemory_x86_64=79G +runUntilConvergence_uk_2002_1000_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_uk_2002_1000_numPartitions_x86_64=102 + +runUntilConvergence_arabic_2005_100_numExecutors_x86_64=12 +runUntilConvergence_arabic_2005_100_executorCores_x86_64=19 +runUntilConvergence_arabic_2005_100_executorMemory_x86_64=79G +runUntilConvergence_arabic_2005_100_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_arabic_2005_100_numPartitions_x86_64=120 + +runUntilConvergence_arabic_2005_500_numExecutors_x86_64=12 +runUntilConvergence_arabic_2005_500_executorCores_x86_64=19 +runUntilConvergence_arabic_2005_500_executorMemory_x86_64=79G +runUntilConvergence_arabic_2005_500_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_arabic_2005_500_numPartitions_x86_64=102 + +runUntilConvergence_arabic_2005_1000_numExecutors_x86_64=12 +runUntilConvergence_arabic_2005_1000_executorCores_x86_64=19 +runUntilConvergence_arabic_2005_1000_executorMemory_x86_64=79G +runUntilConvergence_arabic_2005_1000_extraJavaOptions_x86_64=-Xms79G +runUntilConvergence_arabic_2005_1000_numPartitions_x86_64=120 + +run_cit_patents_100_numExecutors_aarch64=12 +run_cit_patents_100_executorCores_aarch64=23 +run_cit_patents_100_executorMemory_aarch64=79G +run_cit_patents_100_extraJavaOptions_aarch64=-Xms79G +run_cit_patents_100_numPartitions_aarch64=102 + +run_cit_patents_500_numExecutors_aarch64=12 +run_cit_patents_500_executorCores_aarch64=23 +run_cit_patents_500_executorMemory_aarch64=79G +run_cit_patents_500_extraJavaOptions_aarch64=-Xms79G +run_cit_patents_500_numPartitions_aarch64=72 + +run_cit_patents_1000_numExecutors_aarch64=12 +run_cit_patents_1000_executorCores_aarch64=23 +run_cit_patents_1000_executorMemory_aarch64=79G +run_cit_patents_1000_extraJavaOptions_aarch64=-Xms79G +run_cit_patents_1000_numPartitions_aarch64=102 + +run_uk_2002_100_numExecutors_aarch64=12 +run_uk_2002_100_executorCores_aarch64=23 +run_uk_2002_100_executorMemory_aarch64=79G +run_uk_2002_100_extraJavaOptions_aarch64=-Xms79G +run_uk_2002_100_numPartitions_aarch64=120 + +run_uk_2002_500_numExecutors_aarch64=12 +run_uk_2002_500_executorCores_aarch64=23 +run_uk_2002_500_executorMemory_aarch64=79G 
+run_uk_2002_500_extraJavaOptions_aarch64=-Xms79G +run_uk_2002_500_numPartitions_aarch64=120 + +run_uk_2002_1000_numExecutors_aarch64=12 +run_uk_2002_1000_executorCores_aarch64=23 +run_uk_2002_1000_executorMemory_aarch64=79G +run_uk_2002_1000_extraJavaOptions_aarch64=-Xms79G +run_uk_2002_1000_numPartitions_aarch64=102 + +run_arabic_2005_100_numExecutors_aarch64=12 +run_arabic_2005_100_executorCores_aarch64=23 +run_arabic_2005_100_executorMemory_aarch64=79G +run_arabic_2005_100_extraJavaOptions_aarch64=-Xms79G +run_arabic_2005_100_numPartitions_aarch64=120 + +run_arabic_2005_500_numExecutors_aarch64=12 +run_arabic_2005_500_executorCores_aarch64=23 +run_arabic_2005_500_executorMemory_aarch64=79G +run_arabic_2005_500_extraJavaOptions_aarch64=-Xms79G +run_arabic_2005_500_numPartitions_aarch64=120 + +run_arabic_2005_1000_numExecutors_aarch64=12 +run_arabic_2005_1000_executorCores_aarch64=23 +run_arabic_2005_1000_executorMemory_aarch64=79G +run_arabic_2005_1000_extraJavaOptions_aarch64=-Xms79G +run_arabic_2005_1000_numPartitions_aarch64=120 + +run_cit_patents_100_numExecutors_x86_64=12 +run_cit_patents_100_executorCores_x86_64=19 +run_cit_patents_100_executorMemory_x86_64=79G +run_cit_patents_100_extraJavaOptions_x86_64=-Xms79G +run_cit_patents_100_numPartitions_x86_64=102 + +run_cit_patents_500_numExecutors_x86_64=12 +run_cit_patents_500_executorCores_x86_64=19 +run_cit_patents_500_executorMemory_x86_64=79G +run_cit_patents_500_extraJavaOptions_x86_64=-Xms79G +run_cit_patents_500_numPartitions_x86_64=72 + +run_cit_patents_1000_numExecutors_x86_64=12 +run_cit_patents_1000_executorCores_x86_64=19 +run_cit_patents_1000_executorMemory_x86_64=79G +run_cit_patents_1000_extraJavaOptions_x86_64=-Xms79G +run_cit_patents_1000_numPartitions_x86_64=72 + +run_uk_2002_100_numExecutors_x86_64=12 +run_uk_2002_100_executorCores_x86_64=19 +run_uk_2002_100_executorMemory_x86_64=79G +run_uk_2002_100_extraJavaOptions_x86_64=-Xms79G +run_uk_2002_100_numPartitions_x86_64=120 + +run_uk_2002_500_numExecutors_x86_64=12 +run_uk_2002_500_executorCores_x86_64=19 +run_uk_2002_500_executorMemory_x86_64=79G +run_uk_2002_500_extraJavaOptions_x86_64=-Xms79G +run_uk_2002_500_numPartitions_x86_64=102 + +run_uk_2002_1000_numExecutors_x86_64=12 +run_uk_2002_1000_executorCores_x86_64=19 +run_uk_2002_1000_executorMemory_x86_64=79G +run_uk_2002_1000_extraJavaOptions_x86_64=-Xms79G +run_uk_2002_1000_numPartitions_x86_64=102 + +run_arabic_2005_100_numExecutors_x86_64=12 +run_arabic_2005_100_executorCores_x86_64=19 +run_arabic_2005_100_executorMemory_x86_64=19G +run_arabic_2005_100_extraJavaOptions_x86_64=-Xms19G +run_arabic_2005_100_numPartitions_x86_64=120 + +run_arabic_2005_500_numExecutors_x86_64=12 +run_arabic_2005_500_executorCores_x86_64=19 +run_arabic_2005_500_executorMemory_x86_64=79G +run_arabic_2005_500_extraJavaOptions_x86_64=-Xms79G +run_arabic_2005_500_numPartitions_x86_64=120 + +run_arabic_2005_1000_numExecutors_x86_64=12 +run_arabic_2005_1000_executorCores_x86_64=19 +run_arabic_2005_1000_executorMemory_x86_64=79G +run_arabic_2005_1000_extraJavaOptions_x86_64=-Xms79G +run_arabic_2005_1000_numPartitions_x86_64=120 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/wce/wce.yml b/tools/kal-test/conf/graph/wce/wce.yml new file mode 100644 index 0000000..cdfff8a --- /dev/null +++ b/tools/kal-test/conf/graph/wce/wce.yml @@ -0,0 +1,20 @@ +# WCE dataset parameters + +wce: + graph500_24: + splitGraph: " " + maxIterations: 1000 + maxDegree: 2000 + numPartitions: 284 + + graph500_25: + splitGraph: " " + maxIterations: 
1000 + maxDegree: 2000 + numPartitions: 284 + + graph500_26: + splitGraph: " " + maxIterations: 1000 + maxDegree: 2000 + numPartitions: 284 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/wce/wce_spark.properties b/tools/kal-test/conf/graph/wce/wce_spark.properties new file mode 100644 index 0000000..f5c27d9 --- /dev/null +++ b/tools/kal-test/conf/graph/wce/wce_spark.properties @@ -0,0 +1,30 @@ +# Spark parameters +graph500_24_numExectuors_aarch64=71 +graph500_24_executorCores_aarch64=4 +graph500_24_executorMemory_aarch64=12G +graph500_24_extraJavaOptions_aarch64=-Xms12g + +graph500_25_numExectuors_aarch64=71 +graph500_25_executorCores_aarch64=4 +graph500_25_executorMemory_aarch64=12G +graph500_25_extraJavaOptions_aarch64=-Xms12g + +graph500_26_numExectuors_aarch64=71 +graph500_26_executorCores_aarch64=4 +graph500_26_executorMemory_aarch64=12G +graph500_26_extraJavaOptions_aarch64=-Xms12g + +graph500_24_numExectuors_x86_64=59 +graph500_24_executorCores_x86_64=4 +graph500_24_executorMemory_x86_64=15G +graph500_24_extraJavaOptions_x86_64=-Xms15g + +graph500_25_numExectuors_x86_64=59 +graph500_25_executorCores_x86_64=4 +graph500_25_executorMemory_x86_64=15G +graph500_25_extraJavaOptions_x86_64=-Xms15g + +graph500_26_numExectuors_x86_64=59 +graph500_26_executorCores_x86_64=4 +graph500_26_executorMemory_x86_64=15G +graph500_26_extraJavaOptions_x86_64=-Xms15g \ No newline at end of file diff --git a/tools/kal-test/conf/graph/wlpa/wlpa.yml b/tools/kal-test/conf/graph/wlpa/wlpa.yml new file mode 100644 index 0000000..bfd37e5 --- /dev/null +++ b/tools/kal-test/conf/graph/wlpa/wlpa.yml @@ -0,0 +1,32 @@ +wlpa: + opt: + enwiki_2018: + splitGraph: "\\s+" + commputePartition: 282 + maxIter: 10 + + arabic_2005: + splitGraph: "\\s+" + commputePartition: 282 + maxIter: 10 + + GAP_twitter: + splitGraph: "\\s+" + commputePartition: 282 + maxIter: 10 + + raw: + enwiki_2018: + splitGraph: "\\s+" + commputePartition: 236 + maxIter: 10 + + arabic_2005: + splitGraph: "\\s+" + commputePartition: 236 + maxIter: 10 + + GAP_twitter: + splitGraph: "\\s+" + commputePartition: 236 + maxIter: 10 diff --git a/tools/kal-test/conf/graph/wlpa/wlpa_spark.properties b/tools/kal-test/conf/graph/wlpa/wlpa_spark.properties new file mode 100644 index 0000000..0820026 --- /dev/null +++ b/tools/kal-test/conf/graph/wlpa/wlpa_spark.properties @@ -0,0 +1,33 @@ +numExecutors_enwiki_2018_aarch64=47 +executorMemory_enwiki_2018_aarch64=19g +executorCores_enwiki_2018_aarch64=6 +executorExtraJavaopts_enwiki_2018_aarch64="-Xms19g" + +numExecutors_arabic_2005_aarch64=47 +executorMemory_arabic_2005_aarch64=19g +executorCores_arabic_2005_aarch64=6 +executorExtraJavaopts_arabic_2005_aarch64="-Xms19g" + +numExecutors_GAP_twitter_aarch64=47 +executorMemory_GAP_twitter_aarch64=19g +executorCores_GAP_twitter_aarch64=6 +executorExtraJavaopts_GAP_twitter_aarch64="-Xms19g" + +numExecutors_enwiki_2018_x86_64=59 +executorMemory_enwiki_2018_x86_64=15g +executorCores_enwiki_2018_x86_64=4 +executorExtraJavaopts_enwiki_2018_x86_64="-Xms15g" + +numExecutors_arabic_2005_x86_64=59 +executorMemory_arabic_2005_x86_64=15g +executorCores_arabic_2005_x86_64=4 +executorExtraJavaopts_arabic_2005_x86_64="-Xms15g" + +numExecutors_GAP_twitter_x86_64=59 +executorMemory_GAP_twitter_x86_64=15g +executorCores_GAP_twitter_x86_64=4 +executorExtraJavaopts_GAP_twitter_x86_64="-Xms15g" + +master=yarn +deployMode=client +driverMemory=300g \ No newline at end of file diff --git a/tools/kal-test/conf/graph/wpr/wpr.yml b/tools/kal-test/conf/graph/wpr/wpr.yml new 
file mode 100644 index 0000000..4e8107c --- /dev/null +++ b/tools/kal-test/conf/graph/wpr/wpr.yml @@ -0,0 +1,30 @@ +# PageRank dataset parameters arm + +wpr: + static: + cage14: + partitionNum: 60 + numIter: 100 + tolerance: 0 + GAP_road: + partitionNum: 102 + numIter: 100 + tolerance: 0 + GAP_twitter: + partitionNum: 120 + numIter: 100 + tolerance: 0 + + convergence: + cage14: + partitionNum: 60 + numIter: 200 + tolerance: 1e-7 + GAP_road: + partitionNum: 120 + numIter: 200 + tolerance: 1e-7 + GAP_twitter: + partitionNum: 180 + numIter: 200 + tolerance: 1e-7 \ No newline at end of file diff --git a/tools/kal-test/conf/graph/wpr/wpr_spark.properties b/tools/kal-test/conf/graph/wpr/wpr_spark.properties new file mode 100644 index 0000000..c7041bb --- /dev/null +++ b/tools/kal-test/conf/graph/wpr/wpr_spark.properties @@ -0,0 +1,51 @@ +#spark parameters +deployMode=client + +# opt +numExecutors_aarch64=12 +executorCores_aarch64=15 +executorMemory_aarch64=79G +extraJavaOptions_aarch64=-Xms79G + +# raw +static_cage14_numExecutors=12 +static_cage14_executorCores=15 +static_cage14_executorMemory=79G +static_cage14_extraJavaOptions=-Xms79G +static_cage14_partition=60 + +static_GAP_road_numExecutors=12 +static_GAP_road_executorCores=15 +static_GAP_road_executorMemory=79G +static_GAP_road_extraJavaOptions=-Xms79G +static_GAP_road_partition=102 + +static_GAP_twitter_numExecutors=36 +static_GAP_twitter_executorCores=5 +static_GAP_twitter_executorMemory=26G +static_GAP_twitter_extraJavaOptions=-Xms26G +static_GAP_twitter_partition=120 + +convergence_cage14_numExecutors=36 +convergence_cage14_executorCores=5 +convergence_cage14_executorMemory=26G +convergence_cage14_extraJavaOptions=-Xms26G +convergence_cage14_partition=102 + +convergence_GAP_road_numExecutors=54 +convergence_GAP_road_executorCores=3 +convergence_GAP_road_executorMemory=17G +convergence_GAP_road_extraJavaOptions=-Xms17G +convergence_GAP_road_partition=102 + +convergence_GAP_twitter_numExecutors=12 +convergence_GAP_twitter_executorCores=15 +convergence_GAP_twitter_executorMemory=79G +convergence_GAP_twitter_extraJavaOptions=-Xms79G +convergence_GAP_twitter_partition=150 + +static_iter=100 +static_tolerance=0 +convergence_iter=200 +convergence_tolerance=1e-7 +split_graph="\\s+" diff --git a/tools/kal-test/conf/ml/als/als.yml b/tools/kal-test/conf/ml/als/als.yml new file mode 100644 index 0000000..abc6e2b --- /dev/null +++ b/tools/kal-test/conf/ml/als/als.yml @@ -0,0 +1,127 @@ +#ALS model params + +als: + opt: + dataframe: + als: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsbs: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsh: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + rdd: + als: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsbs: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsh: + pt: 276 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + raw: + dataframe: + als: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: 
false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsbs: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsh: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + rdd: + als: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsbs: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 + + alsh: + pt: 228 + numIterations: 200 + nonnegative: false + implicitPrefs: false + numItemBlocks: 228 + numUserBlocks: 228 + regParam: 0.0 + alpha: 1.0 diff --git a/tools/kal-test/conf/ml/als/als_spark.properties b/tools/kal-test/conf/ml/als/als_spark.properties new file mode 100644 index 0000000..e5ed257 --- /dev/null +++ b/tools/kal-test/conf/ml/als/als_spark.properties @@ -0,0 +1,15 @@ +# Spark parameters +master=yarn +deployMode=client + +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=12 +executorCores_aarch64=23 +executorMemory_aarch64=79G + +driverCores_x86_64=30 +driverMemory_x86_64=50G +numExectuors_x86_64=12 +executorCores_x86_64=19 +executorMemory_x86_64=79G diff --git a/tools/kal-test/conf/ml/bo/bo.yml b/tools/kal-test/conf/ml/bo/bo.yml new file mode 100644 index 0000000..af32ac5 --- /dev/null +++ b/tools/kal-test/conf/ml/bo/bo.yml @@ -0,0 +1,22 @@ +#BayesianOptimization model params + +bo: + opt: + BostonHousing: + partitionNum: 1 + + TitanicRf: + partitionNum: 1 + + TitanicGBT: + partitionNum: 1 + + raw: + BostonHousing: + partitionNum: 1 + + TitanicRf: + partitionNum: 1 + + TitanicGBT: + partitionNum: 1 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/bo/bo_spark.properties b/tools/kal-test/conf/ml/bo/bo_spark.properties new file mode 100644 index 0000000..4d6ccfe --- /dev/null +++ b/tools/kal-test/conf/ml/bo/bo_spark.properties @@ -0,0 +1,15 @@ +# Spark parameters +master=yarn +deployMode=client + +aarch64_driverCores=36 +aarch64_driverMemory=50G +aarch64_numExecutors=5 +aarch64_executorCores=5 +aarch64_executorMemory=20G + +x86_64_driverCores=36 +x86_64_driverMemory=50G +x86_64_numExecutors=5 +x86_64_executorCores=5 +x86_64_executorMemory=20G diff --git a/tools/kal-test/conf/ml/cov/cov.yml b/tools/kal-test/conf/ml/cov/cov.yml new file mode 100644 index 0000000..c3a21df --- /dev/null +++ b/tools/kal-test/conf/ml/cov/cov.yml @@ -0,0 +1,22 @@ +#Covariance model params + +cov: + opt: + CP10M1K: + numPartitions: 280 + + CP2M5K: + numPartitions: 280 + + CP1M10K: + numPartitions: 280 + + raw: + CP10M1K: + numPartitions: 234 + + CP2M5K: + numPartitions: 234 + + CP1M10K: + numPartitions: 234 diff --git a/tools/kal-test/conf/ml/cov/cov_spark.properties b/tools/kal-test/conf/ml/cov/cov_spark.properties new file mode 100644 index 0000000..bb32470 --- /dev/null +++ b/tools/kal-test/conf/ml/cov/cov_spark.properties @@ -0,0 +1,19 @@ +# Spark parameters +master=yarn +deployMode=client +driverMaxResultSize=256g +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=15 +executorCores_aarch64=19 +executorMemory_aarch64=63G +executorMemOverhead_aarch64=5G +extraJavaOptions_aarch64=-Xms63g + +driverCores_x86_64=36 +driverMemory_x86_64=50G +numExectuors_x86_64=18 +executorCores_x86_64=13 +executorMemory_x86_64=50G 
+executorMemOverhead_x86_64=5G +extraJavaOptions_x86_64=-Xms50g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dbscan/dbscan.yml b/tools/kal-test/conf/ml/dbscan/dbscan.yml new file mode 100644 index 0000000..6c6581f --- /dev/null +++ b/tools/kal-test/conf/ml/dbscan/dbscan.yml @@ -0,0 +1,20 @@ +#Dbscan model params + +dbscan: + bremenSmall: + numPartitions: 3 + epsilon: 100 + minPoints: 312 + sampleRate: 1.0 + + farm: + numPartitions: 3 + epsilon: 1000 + minPoints: 10 + sampleRate: 1.0 + + house: + numPartitions: 3 + epsilon: 3200 + minPoints: 10 + sampleRate: 1.0 diff --git a/tools/kal-test/conf/ml/dbscan/dbscan_spark.properties b/tools/kal-test/conf/ml/dbscan/dbscan_spark.properties new file mode 100644 index 0000000..f3ad83f --- /dev/null +++ b/tools/kal-test/conf/ml/dbscan/dbscan_spark.properties @@ -0,0 +1,26 @@ +# Spark parameters +master=yarn +deployMode=client +#opt +driverCores_opt=36 +driverMemory_opt=200G +numExectuors_opt=3 +executorCores_opt=64 +executorMemory_opt=316G +extraJavaOptions_opt=-Xms316g + +#raw +driverMaxResultSize_raw=256g +driverCores_raw=30 +driverMemory_raw=200G +numExectuors_raw=12 +executorCores_raw=19 +executorMemory_raw=79G +extraJavaOptions_raw=-Xms79g + +epsilon_bremenSmall_raw=100 +minPoints_bremenSmall_raw=312 +epsilon_farm_raw=1000 +minPoints_farm_raw=10 +epsilon_house_raw=3200 +minPoints_house_raw=10 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dt/dt.yml b/tools/kal-test/conf/ml/dt/dt.yml new file mode 100644 index 0000000..acbb914 --- /dev/null +++ b/tools/kal-test/conf/ml/dt/dt.yml @@ -0,0 +1,354 @@ +#DT model params + +dt: + opt: + classification: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 5120 + pt: 36 + numCopiesInput: 7 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 5120 + pt: 36 + numCopiesInput: 7 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + regression: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + 
useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + + raw: + classification: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 72 + numCopiesInput: 1 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 72 + numCopiesInput: 1 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 14 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + regression: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + 
copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 17 + maxBins: 512 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 1 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featuresType: "array" + copyStrategy: "normal" + useDFCollPtner: "true" \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dt/dt_spark.properties b/tools/kal-test/conf/ml/dt/dt_spark.properties new file mode 100644 index 0000000..aeb8452 --- /dev/null +++ b/tools/kal-test/conf/ml/dt/dt_spark.properties @@ -0,0 +1,71 @@ +# Spark parameters + +driverCores=36 +driverMemory=50g +master=yarn +deployMode=client + +maxFailures=1 +compress=false + +# arm +aarch64_classification_higgs_numExectuors=12 +aarch64_classification_higgs_executorCores=23 +aarch64_classification_higgs_executorMemory=79G +aarch64_classification_higgs_extraJavaOptions=-Xms79g + +aarch64_regression_higgs_numExectuors=12 +aarch64_regression_higgs_executorCores=23 +aarch64_regression_higgs_executorMemory=79G +aarch64_regression_higgs_extraJavaOptions=-Xms79g + +aarch64_classification_mnist8m_numExectuors=35 +aarch64_classification_mnist8m_executorCores=8 +aarch64_classification_mnist8m_executorMemory=26G +aarch64_classification_mnist8m_extraJavaOptions=-Xms26g + +aarch64_regression_mnist8m_numExectuors=35 +aarch64_regression_mnist8m_executorCores=8 +aarch64_regression_mnist8m_executorMemory=26G +aarch64_regression_mnist8m_extraJavaOptions=-Xms26g + +aarch64_classification_epsilon_numExectuors=12 +aarch64_classification_epsilon_executorCores=23 +aarch64_classification_epsilon_executorMemory=79G +aarch64_classification_epsilon_extraJavaOptions=-Xms79g + +aarch64_regression_epsilon_numExectuors=12 +aarch64_regression_epsilon_executorCores=23 +aarch64_regression_epsilon_executorMemory=79G +aarch64_regression_epsilon_extraJavaOptions=-Xms79g + +# x86_64 +x86_64_classification_higgs_numExectuors=12 +x86_64_classification_higgs_executorCores=19 +x86_64_classification_higgs_executorMemory=79G +x86_64_classification_higgs_extraJavaOptions=-Xms79g + +x86_64_regression_higgs_numExectuors=12 +x86_64_regression_higgs_executorCores=19 +x86_64_regression_higgs_executorMemory=79G +x86_64_regression_higgs_extraJavaOptions=-Xms79g + +x86_64_classification_mnist8m_numExectuors=12 +x86_64_classification_mnist8m_executorCores=19 +x86_64_classification_mnist8m_executorMemory=79G +x86_64_classification_mnist8m_extraJavaOptions=-Xms79g + +x86_64_regression_mnist8m_numExectuors=12 +x86_64_regression_mnist8m_executorCores=19 +x86_64_regression_mnist8m_executorMemory=79G +x86_64_regression_mnist8m_extraJavaOptions=-Xms79g + +x86_64_classification_epsilon_numExectuors=12 
+x86_64_classification_epsilon_executorCores=19 +x86_64_classification_epsilon_executorMemory=79G +x86_64_classification_epsilon_extraJavaOptions=-Xms79g + +x86_64_regression_epsilon_numExectuors=29 +x86_64_regression_epsilon_executorCores=8 +x86_64_regression_epsilon_executorMemory=31G +x86_64_regression_epsilon_extraJavaOptions=-Xms31g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dtb/dtb.yml b/tools/kal-test/conf/ml/dtb/dtb.yml new file mode 100644 index 0000000..af22edd --- /dev/null +++ b/tools/kal-test/conf/ml/dtb/dtb.yml @@ -0,0 +1,50 @@ +dtb: + opt: + higgs: + genericPt: 276 + pt: 276 + maxDepth: 5 + maxBins: 10000 + maxMemoryInMB: 256 + numCopiesInput: 1 + useNodeIdCache: false + checkpointInterval: 10 + featuresType: array + bcVariables: false + + mnist8m: + genericPt: 276 + pt: 276 + maxDepth: 5 + maxBins: 10000 + maxMemoryInMB: 256 + numCopiesInput: 1 + useNodeIdCache: false + checkpointInterval: 10 + featuresType: array + bcVariables: false + + raw: + higgs: + genericPt: 232 + pt: 116 + maxDepth: 5 + maxBins: 10000 + maxMemoryInMB: 256 + numCopiesInput: 1 + useNodeIdCache: false + checkpointInterval: 10 + featuresType: array + bcVariables: false + + mnist8m: + genericPt: 232 + pt: 232 + maxDepth: 5 + maxBins: 10000 + maxMemoryInMB: 256 + numCopiesInput: 1 + useNodeIdCache: false + checkpointInterval: 10 + featuresType: array + bcVariables: false \ No newline at end of file diff --git a/tools/kal-test/conf/ml/dtb/dtb_spark.properties b/tools/kal-test/conf/ml/dtb/dtb_spark.properties new file mode 100644 index 0000000..3efb478 --- /dev/null +++ b/tools/kal-test/conf/ml/dtb/dtb_spark.properties @@ -0,0 +1,29 @@ +# Spark parameters +master=yarn +deployMode=client +compress=false +driverCores=30 +driverMemory=50G +maxFailures=1 + +# aarch64 +numExectuors_higgs_aarch64=23 +executorCores_higgs_aarch64=8 +executorMemory_higgs_aarch64=39G +extraJavaOptions_higgs_aarch64=-Xms39g + +numExectuors_mnist8m_aarch64=23 +executorCores_mnist8m_aarch64=8 +executorMemory_mnist8m_aarch64=39G +extraJavaOptions_mnist8m_aarch64=-Xms39g + +# x86_64 +numExectuors_higgs_x86_64=35 +executorCores_higgs_x86_64=8 +executorMemory_higgs_x86_64=26G +extraJavaOptions_higgs_x86_64=-Xms26g + +numExectuors_mnist8m_x86_64=35 +executorCores_mnist8m_x86_64=8 +executorMemory_mnist8m_x86_64=26G +extraJavaOptions_mnist8m_x86_64=-Xms26g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/encoder/encoder.yml b/tools/kal-test/conf/ml/encoder/encoder.yml new file mode 100644 index 0000000..6491338 --- /dev/null +++ b/tools/kal-test/conf/ml/encoder/encoder.yml @@ -0,0 +1,20 @@ +#encoder model params + +encoder: + opt: + encoder_400m: + encodeColumns: "7xxx,15xxx" + numThread: 50 + + encoder_800m: + encodeColumns: "7xxx,15xxx" + numThread: 50 + + raw: + encoder_400m: + encodeColumns: "7xxx,15xxx" + numThread: 50 + + encoder_800m: + encodeColumns: "7xxx,15xxx" + numThread: 50 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/encoder/encoder_spark.properties b/tools/kal-test/conf/ml/encoder/encoder_spark.properties new file mode 100644 index 0000000..cce1977 --- /dev/null +++ b/tools/kal-test/conf/ml/encoder/encoder_spark.properties @@ -0,0 +1,33 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_encoder_400m_driverCores=36 +aarch64_encoder_400m_driverMemory=50G +aarch64_encoder_400m_numExecutors=47 +aarch64_encoder_400m_executorCores=4 +aarch64_encoder_400m_executorMemory=19G +aarch64_encoder_400m_extraJavaOptions=-Xms19g + +aarch64_encoder_800m_driverCores=36 
+aarch64_encoder_800m_driverMemory=50G +aarch64_encoder_800m_numExecutors=47 +aarch64_encoder_800m_executorCores=4 +aarch64_encoder_800m_executorMemory=19G +aarch64_encoder_800m_extraJavaOptions=-Xms19g + +x86_64_encoder_400m_driverCores=36 +x86_64_encoder_400m_driverMemory=50G +x86_64_encoder_400m_numExecutors=35 +x86_64_encoder_400m_executorCores=8 +x86_64_encoder_400m_executorMemory=26G +x86_64_encoder_400m_extraJavaOptions=-Xms26g + + +x86_64_encoder_800m_driverCores=36 +x86_64_encoder_800m_driverMemory=50G +x86_64_encoder_800m_numExecutors=35 +x86_64_encoder_800m_executorCores=8 +x86_64_encoder_800m_executorMemory=26G +x86_64_encoder_800m_extraJavaOptions=-Xms26g + diff --git a/tools/kal-test/conf/ml/fm/fm.yml b/tools/kal-test/conf/ml/fm/fm.yml new file mode 100644 index 0000000..d09b61b --- /dev/null +++ b/tools/kal-test/conf/ml/fm/fm.yml @@ -0,0 +1,136 @@ +#FM model params + +fm: + opt: + classification: + higgs: + pt: 276 + numFeatures: 28 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + epsilon: + pt: 276 + numFeatures: 2000 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + avazu: + pt: 276 + numFeatures: 1000000 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + kdda: + pt: 276 + numFeatures: 20216830 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + regression: + higgs: + pt: 276 + numFeatures: 28 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + epsilon: + pt: 276 + numFeatures: 2000 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + avazu: + pt: 276 + numFeatures: 1000000 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + kdda: + pt: 276 + numFeatures: 20216830 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + raw: + classification: + higgs: + pt: 276 + numFeatures: 28 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + epsilon: + pt: 276 + numFeatures: 2000 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + avazu: + pt: 276 + numFeatures: 1000000 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + kdda: + pt: 276 + numFeatures: 20216830 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + regression: + higgs: + pt: 276 + numFeatures: 28 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + epsilon: + pt: 276 + numFeatures: 2000 + sparseOrDense: "dense" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + avazu: + pt: 276 + numFeatures: 1000000 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 + + kdda: + pt: 276 + numFeatures: 20216830 + sparseOrDense: "sparse" + regParam: 0.0 + numIterations: 5000 + tolerance: 1E-6 diff --git a/tools/kal-test/conf/ml/fm/fm_spark.properties b/tools/kal-test/conf/ml/fm/fm_spark.properties new file mode 100644 index 0000000..e85b83f --- /dev/null +++ b/tools/kal-test/conf/ml/fm/fm_spark.properties @@ -0,0 +1,89 @@ +# Spark parameters + +driverCores=36 +driverMemory=100g +master=yarn +deployMode=client + + +# arm +aarch64_classification_higgs_numExectuors=12 +aarch64_classification_higgs_executorCores=23 +aarch64_classification_higgs_executorMemory=79G +aarch64_classification_higgs_extraJavaOptions=-Xms79g + +aarch64_regression_higgs_numExectuors=12 
+aarch64_regression_higgs_executorCores=23 +aarch64_regression_higgs_executorMemory=79G +aarch64_regression_higgs_extraJavaOptions=-Xms79g + +aarch64_classification_epsilon_numExectuors=12 +aarch64_classification_epsilon_executorCores=23 +aarch64_classification_epsilon_executorMemory=79G +aarch64_classification_epsilon_extraJavaOptions=-Xms79g + +aarch64_regression_epsilon_numExectuors=12 +aarch64_regression_epsilon_executorCores=23 +aarch64_regression_epsilon_executorMemory=79G +aarch64_regression_epsilon_extraJavaOptions=-Xms79g + +aarch64_classification_avazu_numExectuors=12 +aarch64_classification_avazu_executorCores=23 +aarch64_classification_avazu_executorMemory=79G +aarch64_classification_avazu_extraJavaOptions=-Xms79g + +aarch64_regression_avazu_numExectuors=12 +aarch64_regression_avazu_executorCores=23 +aarch64_regression_avazu_executorMemory=79G +aarch64_regression_avazu_extraJavaOptions=-Xms79g + +aarch64_classification_kdda_numExectuors=12 +aarch64_classification_kdda_executorCores=23 +aarch64_classification_kdda_executorMemory=79G +aarch64_classification_kdda_extraJavaOptions=-Xms79g + +aarch64_regression_kdda_numExectuors=12 +aarch64_regression_kdda_executorCores=23 +aarch64_regression_kdda_executorMemory=79G +aarch64_regression_kdda_extraJavaOptions=-Xms79g + +# x86_64 +x86_64_classification_higgs_numExectuors=12 +x86_64_classification_higgs_executorCores=19 +x86_64_classification_higgs_executorMemory=79G +x86_64_classification_higgs_extraJavaOptions=-Xms79g + +x86_64_regression_higgs_numExectuors=12 +x86_64_regression_higgs_executorCores=19 +x86_64_regression_higgs_executorMemory=79G +x86_64_regression_higgs_extraJavaOptions=-Xms79g + +x86_64_classification_epsilon_numExectuors=12 +x86_64_classification_epsilon_executorCores=19 +x86_64_classification_epsilon_executorMemory=79G +x86_64_classification_epsilon_extraJavaOptions=-Xms79g + +x86_64_regression_epsilon_numExectuors=12 +x86_64_regression_epsilon_executorCores=19 +x86_64_regression_epsilon_executorMemory=79G +x86_64_regression_epsilon_extraJavaOptions=-Xms79g + +x86_64_classification_avazu_numExectuors=12 +x86_64_classification_avazu_executorCores=19 +x86_64_classification_avazu_executorMemory=79G +x86_64_classification_avazu_extraJavaOptions=-Xms79g + +x86_64_regression_avazu_numExectuors=12 +x86_64_regression_avazu_executorCores=19 +x86_64_regression_avazu_executorMemory=79G +x86_64_regression_avazu_extraJavaOptions=-Xms79g + +x86_64_classification_kdda_numExectuors=12 +x86_64_classification_kdda_executorCores=19 +x86_64_classification_kdda_executorMemory=79G +x86_64_classification_kdda_extraJavaOptions=-Xms79g + +x86_64_regression_kdda_numExectuors=12 +x86_64_regression_kdda_executorCores=19 +x86_64_regression_kdda_executorMemory=79G +x86_64_regression_kdda_extraJavaOptions=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/fpg/fpg.yml b/tools/kal-test/conf/ml/fpg/fpg.yml new file mode 100644 index 0000000..3f7da63 --- /dev/null +++ b/tools/kal-test/conf/ml/fpg/fpg.yml @@ -0,0 +1,57 @@ +#fpg model params + +fpg: + opt: + Kosarak: + pt: 284 + itemsCol: "items" + minSupport: 0.00085 + minConfidence: 0.8 + optLevel: 2 + timeLimit1: "0.5" + timeLimit2: "0.1" + + Kosarak25: + pt: 236 + itemsCol: "items" + minSupport: 0.001 + minConfidence: 0.8 + optLevel: 2 + timeLimit1: "1.0" + timeLimit2: "0.2" + IBM700: + pt: 236 + itemsCol: "items" + minSupport: 0.001 + minConfidence: 0.8 + optLevel: 1 + timeLimit1: "0.25" + timeLimit2: "" + + + raw: + Kosarak: + pt: 284 + itemsCol: "items" + minSupport: 0.00085 
+ minConfidence: 0.8 + optLevel: 0 + timeLimit1: "" + timeLimit2: "" + Kosarak25: + pt: 236 + itemsCol: "items" + minSupport: 0.001 + minConfidence: 0.8 + optLevel: 0 + timeLimit1: "" + timeLimit2: "" + IBM700: + pt: 236 + itemsCol: "items" + minSupport: 0.001 + minConfidence: 0.8 + optLevel: 0 + timeLimit1: "" + timeLimit2: "" + diff --git a/tools/kal-test/conf/ml/fpg/fpg_spark.properties b/tools/kal-test/conf/ml/fpg/fpg_spark.properties new file mode 100644 index 0000000..b85e8c1 --- /dev/null +++ b/tools/kal-test/conf/ml/fpg/fpg_spark.properties @@ -0,0 +1,43 @@ +master=yarn +deployMode=client +aarch64_Kosarak_driverCores=36 +aarch64_Kosarak_driverMemory=50G +aarch64_Kosarak_numExecutors=71 +aarch64_Kosarak_executorCores=4 +aarch64_Kosarak_executorMemory=12G +aarch64_Kosarak_extraJavaOptions=-Xms12g + +aarch64_Kosarak25_driverCores=36 +aarch64_Kosarak25_driverMemory=50G +aarch64_Kosarak25_numExecutors=12 +aarch64_Kosarak25_executorCores=23 +aarch64_Kosarak25_executorMemory=79G +aarch64_Kosarak25_extraJavaOptions=-Xms79g + +aarch64_IBM700_driverCores=36 +aarch64_IBM700_driverMemory=50G +aarch64_IBM700_numExecutors=12 +aarch64_IBM700_executorCores=23 +aarch64_IBM700_executorMemory=79G +aarch64_IBM700_extraJavaOptions=-Xms79g + +x86_64_Kosarak_driverCores=36 +x86_64_Kosarak_driverMemory=50G +x86_64_Kosarak_numExecutors=71 +x86_64_Kosarak_executorCores=4 +x86_64_Kosarak_executorMemory=12G +x86_64_Kosarak_extraJavaOptions=-Xms12g + +x86_64_Kosarak25_driverCores=36 +x86_64_Kosarak25_driverMemory=50G +x86_64_Kosarak25_numExecutors=12 +x86_64_Kosarak25_executorCores=23 +x86_64_Kosarak25_executorMemory=79G +x86_64_Kosarak25_extraJavaOptions=-Xms79g + +x86_64_IBM700_driverCores=36 +x86_64_IBM700_driverMemory=50G +x86_64_IBM700_numExecutors=12 +x86_64_IBM700_executorCores=23 +x86_64_IBM700_executorMemory=79G +x86_64_IBM700_extraJavaOptions=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/gbdt/gbdt.yml b/tools/kal-test/conf/ml/gbdt/gbdt.yml new file mode 100644 index 0000000..8894944 --- /dev/null +++ b/tools/kal-test/conf/ml/gbdt/gbdt.yml @@ -0,0 +1,78 @@ +#GBDT model params + +gbdt: + opt: + epsilon: + numPartitions: 228 + maxIter: 100 + maxDepth: 5 + maxBins: 20 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" + rcv: + numPartitions: 228 + maxIter: 100 + maxDepth: 5 + maxBins: 20 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" + D10M4096libsvm: + numPartitions: 276 + maxIter: 100 + maxDepth: 5 + maxBins: 64 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" + + raw: + epsilon: + numPartitions: 66 + maxIter: 100 + maxDepth: 5 + maxBins: 20 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" + rcv: + numPartitions: 66 + maxIter: 100 + maxDepth: 5 + maxBins: 20 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" + D10M4096libsvm: + numPartitions: 228 + maxIter: 100 + maxDepth: 5 + maxBins: 64 + stepSize: 0.1 + cacheNodeIds: true + maxMemoryInMB: 2048 + minInstancesPerNode: 1 + minInfoGain: 0.0 + subsamplingRate: 1.0 + featureSubsetStrategy: "auto" 
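For reference, here is a minimal sketch of how a *_spark.properties file such as the gbdt one added below could be mapped onto a spark-submit call. The actual launcher scripts under tools/kal-test/bin/ml/ are not shown in this hunk, so the helper get_prop, the ARCH and DATASET variables, the example class name and the jar name are assumed placeholders rather than identifiers taken from this patch; only the property keys (master, deployMode, driverCores, driverMemory, numExectuors, executorMemory, extraJavaOptions, <dataset>_executorCores_<arch>) come from the file itself.

# Minimal, assumed sketch -- not the real kal-test launcher.
# get_prop reads one key from a Java-style .properties file.
get_prop() {
    # $1 = properties file, $2 = key
    grep "^${2}=" "$1" | head -n 1 | cut -d'=' -f2-
}

PROPS=tools/kal-test/conf/ml/gbdt/gbdt_spark.properties
ARCH=$(uname -m)        # aarch64 or x86_64, matching the key suffixes in the properties file
DATASET=epsilon         # one of: epsilon, rcv, D10M4096libsvm

spark-submit \
  --master          "$(get_prop "$PROPS" master)" \
  --deploy-mode     "$(get_prop "$PROPS" deployMode)" \
  --driver-cores    "$(get_prop "$PROPS" driverCores)" \
  --driver-memory   "$(get_prop "$PROPS" driverMemory)" \
  --num-executors   "$(get_prop "$PROPS" numExectuors)" \
  --executor-cores  "$(get_prop "$PROPS" "${DATASET}_executorCores_${ARCH}")" \
  --executor-memory "$(get_prop "$PROPS" executorMemory)" \
  --conf "spark.executor.extraJavaOptions=$(get_prop "$PROPS" extraJavaOptions)" \
  --class com.example.GBDTRunner \
  kal-test.jar "$DATASET"     # class and jar names are placeholders, not from this patch

The same key naming pattern (an arch- or dataset-qualified prefix on otherwise standard Spark resource options) repeats in the other *_spark.properties files in this patch, so the same lookup approach would apply to them.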
diff --git a/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties b/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties new file mode 100644 index 0000000..e273621 --- /dev/null +++ b/tools/kal-test/conf/ml/gbdt/gbdt_spark.properties @@ -0,0 +1,16 @@ +# Spark parameters +numExectuors=12 + +epsilon_executorCores_aarch64=19 +epsilon_executorCores_x86_64=19 +rcv_executorCores_aarch64=19 +rcv_executorCores_x86_64=19 +D10M4096libsvm_executorCores_aarch64=23 +D10M4096libsvm_executorCores_x86_64=19 + +executorMemory=77G +extraJavaOptions=-Xms77g +driverCores=40 +driverMemory=50G +master=yarn +deployMode=client \ No newline at end of file diff --git a/tools/kal-test/conf/ml/hdb/hdb.yml b/tools/kal-test/conf/ml/hdb/hdb.yml new file mode 100644 index 0000000..e8b2674 --- /dev/null +++ b/tools/kal-test/conf/ml/hdb/hdb.yml @@ -0,0 +1,28 @@ +#encoder model params + +hdb: + opt: + Hibench1m_100: + pt: 100 + mstPartitionNum: 16 + seed: 23 + saurfangThreshold: 0.65 + + Hibench1m_200: + pt: 100 + mstPartitionNum: 16 + seed: 23 + saurfangThreshold: 0.65 + + raw: + Hibench1m_100: + pt: 100 + mstPartitionNum: 16 + seed: 23 + saurfangThreshold: 0.65 + + Hibench1m_200: + pt: 100 + mstPartitionNum: 16 + seed: 23 + saurfangThreshold: 0.65 diff --git a/tools/kal-test/conf/ml/hdb/hdb_spark.properties b/tools/kal-test/conf/ml/hdb/hdb_spark.properties new file mode 100644 index 0000000..9b257f4 --- /dev/null +++ b/tools/kal-test/conf/ml/hdb/hdb_spark.properties @@ -0,0 +1,31 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_Hibench1m_100_driverCores=36 +aarch64_Hibench1m_100_driverMemory=50G +aarch64_Hibench1m_100_numExecutors=35 +aarch64_Hibench1m_100_executorCores=8 +aarch64_Hibench1m_100_executorMemory=26G +aarch64_Hibench1m_100_extraJavaOptions=-Xms5g + +aarch64_Hibench1m_200_driverCores=36 +aarch64_Hibench1m_200_driverMemory=50G +aarch64_Hibench1m_200_numExecutors=35 +aarch64_Hibench1m_200_executorCores=8 +aarch64_Hibench1m_200_executorMemory=26G +aarch64_Hibench1m_200_extraJavaOptions=-Xms5g + +x86_64_Hibench1m_200_driverCores=36 +x86_64_Hibench1m_200_driverMemory=50G +x86_64_Hibench1m_200_numExecutors=35 +x86_64_Hibench1m_200_executorCores=8 +x86_64_Hibench1m_200_executorMemory=26G +x86_64_Hibench1m_200_extraJavaOptions=-Xms5g + +x86_64_Hibench1m_100_driverCores=36 +x86_64_Hibench1m_100_driverMemory=50G +x86_64_Hibench1m_100_numExecutors=35 +x86_64_Hibench1m_100_executorCores=8 +x86_64_Hibench1m_100_executorMemory=26G +x86_64_Hibench1m_100_extraJavaOptions=-Xms5g diff --git a/tools/kal-test/conf/ml/idf/idf.yml b/tools/kal-test/conf/ml/idf/idf.yml new file mode 100644 index 0000000..6a59ee3 --- /dev/null +++ b/tools/kal-test/conf/ml/idf/idf.yml @@ -0,0 +1,25 @@ +#IDF model params + +idf: + opt: + D10m200m: + pt: 0 + combineStrategy: "auto" + fetchMethod: "reduce" + orcFormat: true + D2g250m: + pt: 0 + combineStrategy: "auto" + fetchMethod: "fold" + orcFormat: true + raw: + D10m200m: + pt: 30 + combineStrategy: "not used" + fetchMethod: "not used" + orcFormat: true + D2g250m: + pt: 57 + combineStrategy: "not used" + fetchMethod: "not used" + orcFormat: true \ No newline at end of file diff --git a/tools/kal-test/conf/ml/idf/idf_spark.properties b/tools/kal-test/conf/ml/idf/idf_spark.properties new file mode 100644 index 0000000..57686b6 --- /dev/null +++ b/tools/kal-test/conf/ml/idf/idf_spark.properties @@ -0,0 +1,33 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_D10m200m_driverCores=36 +aarch64_D10m200m_driverMemory=50G +aarch64_D10m200m_numExecutors=12 
+aarch64_D10m200m_executorCores=23 +aarch64_D10m200m_executorMemory=79G +aarch64_D10m200m_extraJavaOptions=-Xms79g + +aarch64_D2g250m_driverCores=36 +aarch64_D2g250m_driverMemory=50G +aarch64_D2g250m_numExecutors=12 +aarch64_D2g250m_executorCores=23 +aarch64_D2g250m_executorMemory=79G +aarch64_D2g250m_extraJavaOptions=-Xms79g + + +x86_64_D10m200m_driverCores=30 +x86_64_D10m200m_driverMemory=50G +x86_64_D10m200m_numExecutors=12 +x86_64_D10m200m_executorCores=19 +x86_64_D10m200m_executorMemory=79G +x86_64_D10m200m_extraJavaOptions=-Xms79g + +x86_64_D2g250m_driverCores=30 +x86_64_D2g250m_driverMemory=50G +x86_64_D2g250m_numExecutors=12 +x86_64_D2g250m_executorCores=19 +x86_64_D2g250m_executorMemory=79G +x86_64_D2g250m_extraJavaOptions=-Xms79g + diff --git a/tools/kal-test/conf/ml/if/if.yml b/tools/kal-test/conf/ml/if/if.yml new file mode 100644 index 0000000..f6d9b8a --- /dev/null +++ b/tools/kal-test/conf/ml/if/if.yml @@ -0,0 +1,52 @@ +#IF model params + +IF: + opt: + if_40M_1K: + pt: 280 + numTrees: 100 + bootstrap: false + maxInstances: 256 + maxFea: 1.0 + featuresCol: "features" + predictionCol: "predictedLabel" + scoreCol: "anomalyScore" + contamination: 0.1 + randomSeed: 11 + + if_1M_1K: + pt: 280 + numTrees: 100 + bootstrap: false + maxInstances: 256 + maxFea: 1.0 + featuresCol: "features" + predictionCol: "predictedLabel" + scoreCol: "anomalyScore" + contamination: 0.1 + randomSeed: 11 + + raw: + if_40M_1K: + pt: 280 + numTrees: 100 + bootstrap: false + maxInstances: 256 + maxFea: 1.0 + featuresCol: "features" + predictionCol: "predictedLabel" + scoreCol: "anomalyScore" + contamination: 0.1 + randomSeed: 1 + + if_1M_1K: + pt: 280 + numTrees: 100 + bootstrap: false + maxInstances: 256 + maxFea: 1.0 + featuresCol: "features" + predictionCol: "predictedLabel" + scoreCol: "anomalyScore" + contamination: 0.1 + randomSeed: 1 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/if/if_spark.properties b/tools/kal-test/conf/ml/if/if_spark.properties new file mode 100644 index 0000000..8fb78bf --- /dev/null +++ b/tools/kal-test/conf/ml/if/if_spark.properties @@ -0,0 +1,25 @@ +master=yarn +deployMode=client +aarch64_if_40M_1K_driverCores=36 +aarch64_if_40M_1K_driverMemory=50G +aarch64_if_40M_1K_numExecutors=35 +aarch64_if_40M_1K_executorCores=8 +aarch64_if_40M_1K_executorMemory=26G + +aarch64_if_1M_1K_driverCores=36 +aarch64_if_1M_1K_driverMemory=50G +aarch64_if_1M_1K_numExecutors=35 +aarch64_if_1M_1K_executorCores=8 +aarch64_if_1M_1K_executorMemory=26G + +x86_64_if_1M_1K_driverCores=30 +x86_64_if_1M_1K_driverMemory=50G +x86_64_if_1M_1K_numExecutors=29 +x86_64_if_1M_1K_executorCores=8 +x86_64_if_1M_1K_executorMemory=31G + +x86_64_if_40M_1K_driverCores=30 +x86_64_if_40M_1K_driverMemory=50G +x86_64_if_40M_1K_numExecutors=29 +x86_64_if_40M_1K_executorCores=8 +x86_64_if_40M_1K_executorMemory=31G \ No newline at end of file diff --git a/tools/kal-test/conf/ml/kmeans/kmeans.yml b/tools/kal-test/conf/ml/kmeans/kmeans.yml new file mode 100644 index 0000000..3bc75db --- /dev/null +++ b/tools/kal-test/conf/ml/kmeans/kmeans.yml @@ -0,0 +1,47 @@ +#KMeans model params + +kmeans: + D1200M20_aarch64: + numPartitions: 280 + maxIterations: 200 + k: 200 + + D200M20_aarch64: + numPartitions: 280 + maxIterations: 200 + k: 200 + + D200M100_aarch64: + numPartitions: 280 + maxIterations: 200 + k: 200 + + D1200M20_x86_64: + numPartitions: 232 + maxIterations: 200 + k: 200 + + D200M20_x86_64: + numPartitions: 232 + maxIterations: 200 + k: 200 + + D200M100_x86_64: + numPartitions: 232 + maxIterations: 200 + k: 
200 + + D1200M20_raw: + numPartitions: 232 + maxIterations: 200 + k: 200 + + D200M20_raw: + numPartitions: 232 + maxIterations: 200 + k: 200 + + D200M100_raw: + numPartitions: 232 + maxIterations: 200 + k: 200 diff --git a/tools/kal-test/conf/ml/kmeans/kmeans_spark.properties b/tools/kal-test/conf/ml/kmeans/kmeans_spark.properties new file mode 100644 index 0000000..535ff97 --- /dev/null +++ b/tools/kal-test/conf/ml/kmeans/kmeans_spark.properties @@ -0,0 +1,18 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores_aarch64=50 +driverMemory_aarch64=36G +numExectuors_aarch64=35 +executorCores_aarch64=8 +executorMemory_aarch64=26G +extraJavaOptions_aarch64=-Xms12g +driverJavaOptions_aarch64=-Xms15g + +driverCores_x86_64=50 +driverMemory_x86_64=30G +numExectuors_x86_64=29 +executorCores_x86_64=8 +executorMemory_x86_64=30G +extraJavaOptions_x86_64=-Xms30g +driverJavaOptions_x86_64=-Xms15g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/knn/knn.yml b/tools/kal-test/conf/ml/knn/knn.yml new file mode 100644 index 0000000..1a7372c --- /dev/null +++ b/tools/kal-test/conf/ml/knn/knn.yml @@ -0,0 +1,59 @@ +#KNN model params + +knn: + opt: + glove: + pt: 280 + k: 100 + testNum: 50000 + testBatchSize: 5000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 30 + + gist: + pt: 280 + k: 100 + testNum: 50000 + testBatchSize: 5000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 30 + + deep1b: + pt: 280 + k: 100 + testNum: 50000 + testBatchSize: 2000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 30 + + + raw: + glove: + pt: 236 + k: 100 + testNum: 50000 + testBatchSize: 5000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 100 + + gist: + pt: 232 + k: 100 + testNum: 50000 + testBatchSize: 5000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 1000 + + deep1b: + pt: 236 + k: 100 + testNum: 50000 + testBatchSize: 2000 + topTreeSizeRate: 10.0 + topTreeLeafSize: 10 + subTreeLeafSize: 1000 diff --git a/tools/kal-test/conf/ml/knn/knn_spark.properties b/tools/kal-test/conf/ml/knn/knn_spark.properties new file mode 100644 index 0000000..ecedc58 --- /dev/null +++ b/tools/kal-test/conf/ml/knn/knn_spark.properties @@ -0,0 +1,54 @@ +# Spark parameters +master=yarn +deployMode=client +compress=false + +driverCores_glove_aarch64=36 +driverMemory_glove_aarch64=50G +numExectuors_glove_aarch64=35 +executorCores_glove_aarch64=8 +executorMemory_glove_aarch64=26G +extraJavaOptions_glove_aarch64=-Xms26g +execMemOverhead_glove_aarch64=3G + +driverCores_gist_aarch64=36 +driverMemory_gist_aarch64=50G +numExectuors_gist_aarch64=35 +executorCores_gist_aarch64=8 +executorMemory_gist_aarch64=26G +extraJavaOptions_gist_aarch64=-Xms26g +execMemOverhead_gist_aarch64=3G + +driverCores_deep1b_aarch64=36 +driverMemory_deep1b_aarch64=50G +numExectuors_deep1b_aarch64=35 +executorCores_deep1b_aarch64=8 +executorMemory_deep1b_aarch64=26G +extraJavaOptions_deep1b_aarch64=-Xms26g +execMemOverhead_deep1b_aarch64=3G + + + +driverCores_glove_x86_64=30 +driverMemory_glove_x86_64=50G +numExectuors_glove_x86_64=18 +executorCores_glove_x86_64=13 +executorMemory_glove_x86_64=50G +extraJavaOptions_glove_x86_64=-Xms50g +execMemOverhead_glove_x86_64=4G + +driverCores_gist_x86_64=30 +driverMemory_gist_x86_64=50G +numExectuors_gist_x86_64=12 +executorCores_gist_x86_64=19 +executorMemory_gist_x86_64=79G +extraJavaOptions_gist_x86_64=-Xms79g +execMemOverhead_gist_x86_64=5G + +driverCores_deep1b_x86_64=30 +driverMemory_deep1b_x86_64=50G 
+numExectuors_deep1b_x86_64=18 +executorCores_deep1b_x86_64=13 +executorMemory_deep1b_x86_64=50G +extraJavaOptions_deep1b_x86_64=-Xms50g +execMemOverhead_deep1b_x86_64=5G \ No newline at end of file diff --git a/tools/kal-test/conf/ml/lda/lda.yml b/tools/kal-test/conf/ml/lda/lda.yml new file mode 100644 index 0000000..7261ac0 --- /dev/null +++ b/tools/kal-test/conf/ml/lda/lda.yml @@ -0,0 +1,58 @@ +#LDA model params + +lda: + opt: + nytimes: + numPartitions: 12 + numPartitionsTest: 276 + numFeatures: 102660 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 + + pubmed: + numPartitions: 12 + numPartitionsTest: 276 + numFeatures: 141043 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 + + D20M200K: + numPartitions: 12 + numPartitionsTest: 276 + numFeatures: 200000 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 + + raw: + nytimes: + numPartitions: 96 + numPartitionsTest: 228 + numFeatures: 102660 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 + + pubmed: + numPartitions: 216 + numPartitionsTest: 228 + numFeatures: 141043 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 + + D20M200K: + numPartitions: 228 + numPartitionsTest: 228 + numFeatures: 200000 + checkpointInterval: -1 + inputDataType: libsvm + maxIter: 100 + k: 100 diff --git a/tools/kal-test/conf/ml/lda/lda_spark.properties b/tools/kal-test/conf/ml/lda/lda_spark.properties new file mode 100644 index 0000000..162149d --- /dev/null +++ b/tools/kal-test/conf/ml/lda/lda_spark.properties @@ -0,0 +1,16 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores_aarch64=40 +driverMemory_aarch64=100G +numExectuors_aarch64=12 +executorCores_aarch64=23 +executorMemory_aarch64=75G +extraJavaOptions_aarch64=-Xms75g + +driverCores_x86_64=40 +driverMemory_x86_64=100G +numExectuors_x86_64=12 +executorCores_x86_64=19 +executorMemory_x86_64=75G +extraJavaOptions_x86_64=-Xms75g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/lgbm/lgbm.yml b/tools/kal-test/conf/ml/lgbm/lgbm.yml new file mode 100644 index 0000000..32145ec --- /dev/null +++ b/tools/kal-test/conf/ml/lgbm/lgbm.yml @@ -0,0 +1,294 @@ +#LightGBM model params +lgbm: + opt: + classification: + mnist8m: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 60 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 4 + network_compression: 1 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.0 + loading_balance: "true" + + higgs: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 60 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 4 + network_compression: 0 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.0 + loading_balance: "false" + + criteo: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 78 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 3 + network_compression: 0 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.03 + 
loading_balance: "false" + + regression: + mnist8m: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 60 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 4 + network_compression: 1 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.0 + loading_balance: "true" + + higgs: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 60 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 4 + network_compression: 0 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.0 + loading_balance: "false" + + criteo: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 78 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: 3 + network_compression: 0 + hist_synch_algo: 2 + logloss_apx: 0 + logloss_apx_eps: 0.03 + loading_balance: "false" + raw: + classification: + mnist8m: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: + + higgs: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: + + criteo: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: + + regression: + mnist8m: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: + + higgs: + objective: "binary" + labelCol: "label" + featuresCol: "features" + verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: + + criteo: + objective: "binary" + labelCol: "label" + featuresCol: "features" + 
verbosity: 0 + eta: 0.1 + max_depth: 6 + max_bin: 16 + num_round: 500 + num_tasks: 59 + min_gain_to_split: 1.0 + lambda_l2: 1.0 + num_leaves: 64 + min_child_weight: 1.0 + min_data_in_leaf: 1 + bagging: 1.0 + bagging_freq: 1 + num_threads: + network_compression: + hist_synch_algo: + logloss_apx: + logloss_apx_eps: + loading_balance: \ No newline at end of file diff --git a/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties b/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties new file mode 100644 index 0000000..9534063 --- /dev/null +++ b/tools/kal-test/conf/ml/lgbm/lgbm_spark.properties @@ -0,0 +1,51 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_mnist8m_driverCores=36 +aarch64_mnist8m_driverMemory=50G +aarch64_mnist8m_numExecutors=60 +aarch64_mnist8m_executorCores=4 +aarch64_mnist8m_executorMemory=15G +aarch64_mnist8m_executorMemOverhead=2G +aarch64_mnist8m_extraJavaOptions=-Xms12g + +aarch64_higgs_driverCores=36 +aarch64_higgs_driverMemory=50G +aarch64_higgs_numExecutors=71 +aarch64_higgs_executorCores=4 +aarch64_higgs_executorMemory=12G +aarch64_higgs_executorMemOverhead=2G +aarch64_higgs_extraJavaOptions=-Xms12g + +aarch64_criteo_driverCores=36 +aarch64_criteo_driverMemory=50G +aarch64_criteo_numExecutors=71 +aarch64_criteo_executorCores=4 +aarch64_criteo_executorMemory=12G +aarch64_criteo_executorMemOverhead=2G +aarch64_criteo_extraJavaOptions=-Xms12g + +x86_64_mnist8m_driverCores=36 +x86_64_mnist8m_driverMemory=50G +x86_64_mnist8m_numExecutors=59 +x86_64_mnist8m_executorCores=4 +x86_64_mnist8m_executorMemory=15G +x86_64_mnist8m_executorMemOverhead=2G +x86_64_mnist8m_extraJavaOptions=-Xms15g + +x86_64_higgs_driverCores=36 +x86_64_higgs_driverMemory=50G +x86_64_higgs_numExecutors=59 +x86_64_higgs_executorCores=4 +x86_64_higgs_executorMemory=15G +x86_64_higgs_executorMemOverhead=2G +x86_64_higgs_extraJavaOptions=-Xms15g + +x86_64_criteo_driverCores=36 +x86_64_criteo_driverMemory=50G +x86_64_criteo_numExecutors=59 +x86_64_criteo_executorCores=4 +x86_64_criteo_executorMemory=15G +x86_64_criteo_executorMemOverhead=2G +x86_64_criteo_extraJavaOptions=-Xms15g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/linR/linR.yml b/tools/kal-test/conf/ml/linR/linR.yml new file mode 100644 index 0000000..4767479 --- /dev/null +++ b/tools/kal-test/conf/ml/linR/linR.yml @@ -0,0 +1,58 @@ +#Linear Regression model params + +linR: + opt: + mnist8m: + pt: 276 + numFeatures: 784 + loss: "huber" + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + + Twitter: + pt: 276 + numFeatures: 9866 + loss: "squaredError" + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + + rcv: + pt: 276 + numFeatures: 47236 + loss: "squaredError" + regParam: 0.01 + elasticNetParam: 1.0 + maxIter: 400 + tolerance: 1E-6 + + raw: + mnist8m: + pt: 276 + numFeatures: 784 + loss: "huber" + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + + Twitter: + pt: 276 + numFeatures: 9866 + loss: "squaredError" + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + + rcv: + pt: 276 + numFeatures: 47236 + loss: "squaredError" + regParam: 0.01 + elasticNetParam: 1.0 + maxIter: 400 + tolerance: 1E-6 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/linR/linR_spark.properties b/tools/kal-test/conf/ml/linR/linR_spark.properties new file mode 100644 index 0000000..ca066c9 --- /dev/null +++ b/tools/kal-test/conf/ml/linR/linR_spark.properties @@ -0,0 +1,19 @@ +# Spark parameters +master=yarn +deployMode=client +maxFailures=1 
+compress=false + +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=12 +executorCores_aarch64=23 +executorMemory_aarch64=79G +extraJavaOptions_aarch64=-Xms79g + +driverCores_x86_64=30 +driverMemory_x86_64=50G +numExectuors_x86_64=12 +executorCores_x86_64=19 +executorMemory_x86_64=79G +extraJavaOptions_x86_64=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/logR/logR.yml b/tools/kal-test/conf/ml/logR/logR.yml new file mode 100644 index 0000000..02399a4 --- /dev/null +++ b/tools/kal-test/conf/ml/logR/logR.yml @@ -0,0 +1,76 @@ +#Logistic Regression model params + +logR: + opt: + mnist8m: + numPartitions: 276 + numLabels: 10 + numFeatures: 784 + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: true + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 + + Twitter: + numPartitions: 276 + numLabels: 1 + numFeatures: 9866 + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: false + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 + + rcv: + numPartitions: 276 + numLabels: 1 + numFeatures: 47236 + regParam: 0.01 + elasticNetParam: 1.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: false + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 + + raw: + mnist8m: + numPartitions: 276 + numLabels: 10 + numFeatures: 784 + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: true + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 + + Twitter: + numPartitions: 276 + numLabels: 1 + numFeatures: 9866 + regParam: 0.01 + elasticNetParam: 0.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: false + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 + + rcv: + numPartitions: 276 + numLabels: 1 + numFeatures: 47236 + regParam: 0.01 + elasticNetParam: 1.0 + maxIter: 400 + tolerance: 1E-6 + isSetBound: false + coefficientLowerBound: -1E32 + coefficientUpperBound: 1E32 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/logR/logR_spark.properties b/tools/kal-test/conf/ml/logR/logR_spark.properties new file mode 100644 index 0000000..0576b9e --- /dev/null +++ b/tools/kal-test/conf/ml/logR/logR_spark.properties @@ -0,0 +1,16 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=12 +executorCores_aarch64=23 +executorMemory_aarch64=79G +extraJavaOptions_aarch64=-Xms79g + +driverCores_x86_64=30 +driverMemory_x86_64=50G +numExectuors_x86_64=12 +executorCores_x86_64=19 +executorMemory_x86_64=79G +extraJavaOptions_x86_64=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/ml_datasets.properties b/tools/kal-test/conf/ml/ml_datasets.properties new file mode 100644 index 0000000..97b5b03 --- /dev/null +++ b/tools/kal-test/conf/ml/ml_datasets.properties @@ -0,0 +1,102 @@ +# sparkVersion +sparkVersion=spark3.1.1 + +# kalVersion +kalVersion=2.2.0 + +# scalaVersion +scalaVersion=2.12 + +#saveResultPath +saveResultPath=hdfs:///tmp/ml/result + +# data path +epsilon=hdfs:///tmp/ml/dataset/epsilon_train,hdfs:///tmp/ml/dataset/epsilon_test +rcv=hdfs:///tmp/ml/dataset/rcv1bin_train,hdfs:///tmp/ml/dataset/rcv1bin_test +mnist8m=hdfs:///tmp/ml/dataset/mnist8m_train,hdfs:///tmp/ml/dataset/mnist8m_test +higgs=hdfs:///tmp/ml/dataset/higgs_train,hdfs:///tmp/ml/dataset/higgs_test +D10M4096libsvm=hdfs:///tmp/ml/dataset/10M4096libsvm,hdfs:///tmp/ml/dataset/10M4096libsvm +ECBDL14=hdfs:///tmp/ml/dataset/ECBDL14_train.orc,hdfs:///tmp/ml/dataset/ECBDL14_test.orc 
+D10M4096=hdfs:///tmp/ml/dataset/svm_10m4096_train,hdfs:///tmp/ml/dataset/svm_10m4096_test +D40M1K=hdfs:///tmp/ml/dataset/svm_40m_1k_train,hdfs:///tmp/ml/dataset/svm_40m_1k_test +D280M118=hdfs:///tmp/ml/dataset/svm_280m_118_train,hdfs:///tmp/ml/dataset/svm_280m_118_test +D1200M20=hdfs:///tmp/ml/dataset/kmeans_1200m20 +D200M20=hdfs:///tmp/ml/dataset/kmeans_200m20 +D200M100=hdfs:///tmp/ml/dataset/kmeans_200m100 +Twitter=hdfs:///tmp/ml/dataset/Twitter_train,hdfs:///tmp/ml/dataset/Twitter_test +D10M1K=hdfs:///tmp/ml/dataset/pca_svd_10M1K +D1M10K=hdfs:///tmp/ml/dataset/pca_svd_1M10K +MESH=hdfs:///tmp/ml/dataset/MESH_DEFORM/mesh_deform.mtx +RUCCI=hdfs:///tmp/ml/dataset/RUCCI/Rucci1.mtx +D20M200K=hdfs:///tmp/ml/dataset/lda_20m200k,hdfs:///tmp/ml/dataset/lda_20m200k +nytimes=hdfs:///tmp/ml/dataset/nytimes,hdfs:///tmp/ml/dataset/nytimes +pubmed=hdfs:///tmp/ml/dataset/pubmed,hdfs:///tmp/ml/dataset/pubmed +als=hdfs:///tmp/ml/dataset/ALS +alsbs=hdfs:///tmp/ml/dataset/ALS_bs +alsh=hdfs:///tmp/ml/dataset/ALS_h +glove=hdfs:///tmp/ml/dataset/GloVe +gist=hdfs:///tmp/ml/dataset/GIST +deep1b=hdfs:///tmp/ml/dataset/DEEP1B +kosarak=hdfs:///tmp/ml/dataset/Kosarak +Kosarak=hdfs:///tmp/ml/dataset/Kosarak +Kosarak25=hdfs:///tmp/ml/dataset/Kosarak25k +IBM700=hdfs:///tmp/ml/dataset/IBM_seq_c700it40t9s2npat5corr75 +IBM100M47=hdfs:///tmp/ml/dataset/IBM_Seq_c100000it100t5s10 +IBM10M47=hdfs:///tmp/ml/dataset/IBM_Seq_c10000it100t5s10 +CP10M1K=hdfs:///tmp/ml/dataset/CP10M1K +CP2M5K=hdfs:///tmp/ml/dataset/CP2M5K +CP1M10K=hdfs:///tmp/ml/dataset/CP1M10K +bremenSmall=hdfs:///tmp/ml/dataset/bremenSmall +farm=hdfs:///tmp/ml/dataset/farm +house=hdfs:///tmp/ml/dataset/house +mesh_deform=hdfs:///tmp/hxy_test/sparse_matrix/mesh_deform_234024_9394 +Kemelmacher=hdfs:///tmp/hxy_test/sparse_matrix/Kemelmacher_28453_9694 +wathen100=hdfs:///tmp/hxy_test/sparse_matrix/wathen100_30402_30402 +MOLIERE=hdfs:///tmp/hxy_test/sparse_matrix/MOLIERE_2016_30239688_30239688 +D10m200m=hdfs:///tmp/ml/dataset/idf_10m_200m_orc +D2g250m=hdfs:///tmp/ml/dataset/idf_2g_250m_orc +simrank3w=hdfs:///tmp/wc_test/simrank/HibenchRating3wx3w +simrank5w=hdfs:///tmp/wc_test/simrank/HibenchRating5wx5w +simrank7w=hdfs:///tmp/wc_test/simrank/HibenchRating7wx7w +alibaba_cate=hdfs:///tmp/ml/dataset/w2v/cases/UBA/sentences/cate +alibaba_cate_downstreamTrainFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTrain +alibaba_cate_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTest +alibaba_item=hdfs:///tmp/ml/dataset/w2v/cases/UBA/sentences/item +alibaba_item_downstreamTrainFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTrain +alibaba_item_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTest +alibaba_node=hdfs:///tmp/ml/dataset/w2v/cases/UBA/sentences/node +alibaba_node_downstreamTrainFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTrain +alibaba_node_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/cases/UBA/downstreamTest +alibaba_taobao=hdfs:///tmp/ml/dataset/w2v/product/taobao/sentences +alibaba_taobao_downstreamTrainFile=hdfs:///tmp/ml/dataset/w2v/product/taobao/downstreamTrain +alibaba_taobao_downstreamTestFile=hdfs:///tmp/ml/dataset/w2v/product/taobao/downstreamTest +if_40M_1k=hdfs:///tmp/ml/dataset/40M_1k +if_1M_1k=hdfs:///tmp/ml/dataset/1M_1k +encoder_400m=hdfs:///tmp/ml/dataset/encoder/encoder_400m,./datasets/featureMap_400m.json +encoder_800m=hdfs:///tmp/ml/dataset/encoder/encoder_800m,./datasets/featureMap_800m.json 
+movielens=hdfs:///tmp/ml/dataset/movielens/movielens_train_with_folds.csv,hdfs:///tmp/ml/dataset/movielens/movielens_test.csv +taobao=hdfs:///tmp/ml/dataset/taobao/taobao_train_with_folds.csv,hdfs:///tmp/ml/dataset/taobao/taobao_test.csv +criteo40m=hdfs:///tmp/ml/dataset/criteo40m/criteo40m_train_with_folds.csv,hdfs:///tmp/ml/dataset/criteo40m/criteo40m_test.csv +criteo150m=hdfs:///tmp/ml/dataset/criteo150m/criteo150m_train_with_folds.csv,hdfs:///tmp/ml/dataset/criteo150m/criteo150m_test.csv +Books=hdfs:///tmp/ml/dataset/nmf/Books.csv +CSJ=hdfs:///tmp/ml/dataset/nmf/Clothing_Shoes_and_Jewelry.csv +MT=hdfs:///tmp/ml/dataset/nmf/Movies_and_TV.csv +BostonHousing=hdfs:///tmp/ml/dataset/BostonHousing.csv +TitanicRf=hdfs:///tmp/ml/dataset/titanic.csv +TitanicGBT=hdfs:///tmp/ml/dataset/titanic.csv +Hibench1m_100="./datasets/kmeans_1m_100" +Hibench1m_200="./datasets/kmeans_1m_200" + +#output data path +D10M1K_output=hdfs:///tmp/ml/output/svd/D10M1K +D1M10K_output=hdfs:///tmp/ml/output/svd/D1M10K +MESH_output=hdfs:///tmp/ml/output/svd/MESH +RUCCI_output=hdfs:///tmp/ml/output/svd/RUCCI +CP10M1K_output=hdfs:///tmp/ml/output/pearson/CP10M1K +CP2M5K_output=hdfs:///tmp/ml/output/pearson/CP2M5K +CP1M10K_output=hdfs:///tmp/ml/output/pearson/CP1M10K +SpearMan_CP10M1K_output=hdfs:///tmp/ml/result/spearman/CP10M1K +SpearMan_CP2M5K_output=hdfs:///tmp/ml/result/spearman/CP2M5K +SpearMan_CP1M10K_output=hdfs:///tmp/ml/result/spearman/CP1M10K +D10m200m_modelsPath="./output/models/D10m200m" +D2g250m_modelsPath="./output/models/D2g250m" diff --git a/tools/kal-test/conf/ml/nmf/nmf.yml b/tools/kal-test/conf/ml/nmf/nmf.yml new file mode 100644 index 0000000..61d4bb1 --- /dev/null +++ b/tools/kal-test/conf/ml/nmf/nmf.yml @@ -0,0 +1,52 @@ +#nmf model params + +nmf: + opt: + MT: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 + + CSJ: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 + + Books: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 + + raw: + MT: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 + + CSJ: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 + + Books: + userCol: "user" + itemCol: "item" + ratingCol: "rating" + pt: 250 + rank: 100 + maxIter: 50 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/nmf/nmf_spark.properties b/tools/kal-test/conf/ml/nmf/nmf_spark.properties new file mode 100644 index 0000000..d31fa99 --- /dev/null +++ b/tools/kal-test/conf/ml/nmf/nmf_spark.properties @@ -0,0 +1,51 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_MT_driverCores=36 +aarch64_MT_driverMemory=50G +aarch64_MT_numExecutors=71 +aarch64_MT_executorCores=4 +aarch64_MT_executorMemory=12G +aarch64_MT_executorMemOverhead=2G +aarch64_MT_extraJavaOptions=-Xms12g + +aarch64_CSJ_driverCores=36 +aarch64_CSJ_driverMemory=50G +aarch64_CSJ_numExecutors=71 +aarch64_CSJ_executorCores=4 +aarch64_CSJ_executorMemory=12G +aarch64_CSJ_executorMemOverhead=2G +aarch64_CSJ_extraJavaOptions=-Xms12g + +aarch64_Books_driverCores=36 +aarch64_Books_driverMemory=50G +aarch64_Books_numExecutors=71 +aarch64_Books_executorCores=4 +aarch64_Books_executorMemory=12G +aarch64_Books_executorMemOverhead=2G +aarch64_Books_extraJavaOptions=-Xms12g + +x86_64_MT_driverCores=36 +x86_64_MT_driverMemory=50G +x86_64_MT_numExecutors=59 +x86_64_MT_executorCores=4 +x86_64_MT_executorMemory=15G 
+x86_64_MT_executorMemOverhead=2G +x86_64_MT_extraJavaOptions=-Xms15g + +x86_64_CSJ_driverCores=36 +x86_64_CSJ_driverMemory=50G +x86_64_CSJ_numExecutors=59 +x86_64_CSJ_executorCores=4 +x86_64_CSJ_executorMemory=15G +x86_64_CSJ_executorMemOverhead=2G +x86_64_CSJ_extraJavaOptions=-Xms15g + +x86_64_Books_driverCores=36 +x86_64_Books_driverMemory=50G +x86_64_Books_numExecutors=59 +x86_64_Books_executorCores=4 +x86_64_Books_executorMemory=15G +x86_64_Books_executorMemOverhead=2G +x86_64_Books_extraJavaOptions=-Xms15g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/pca/pca.yml b/tools/kal-test/conf/ml/pca/pca.yml new file mode 100644 index 0000000..3f2af6e --- /dev/null +++ b/tools/kal-test/conf/ml/pca/pca.yml @@ -0,0 +1,53 @@ +#pca model params + +pca: + opt: + D10M1K: + numPartitions: 280 + k: 500 + numCols: 0 + numRows: 0 + dataFormat: dense + sep: "," + + D1M10K: + numPartitions: 285 + k: 500 + numCols: 0 + numRows: 0 + dataFormat: dense + sep: "," + + MESH: + numPartitions: 285 + k: 500 + numCols: 9394 + numRows: 0 + dataFormat: coo + sep: " " + + raw: + D10M1K: + numPartitions: 236 + k: 500 + numCols: 0 + numRows: 0 + dataFormat: dense + sep: "," + + D1M10K: + numPartitions: 234 + k: 500 + numCols: 0 + numRows: 0 + dataFormat: dense + sep: "," + + MESH: + numPartitions: 234 + k: 500 + numCols: 9394 + numRows: 0 + dataFormat: coo + sep: " " + diff --git a/tools/kal-test/conf/ml/pca/pca_spark.properties b/tools/kal-test/conf/ml/pca/pca_spark.properties new file mode 100644 index 0000000..3e664f2 --- /dev/null +++ b/tools/kal-test/conf/ml/pca/pca_spark.properties @@ -0,0 +1,55 @@ +# Spark parameters +master=yarn +deployMode=client + +D10M1K_driverCores_aarch64=50 +D10M1K_driverMemory_aarch64=36G +D10M1K_numExectuors_aarch64=35 +D10M1K_executorCores_aarch64=8 +D10M1K_executorMemory_aarch64=26G +D10M1K_executorMemoryOverhead_aarch64=3G +D10M1K_extraJavaOptions_aarch64=-XX:+UseNUMA + + +D10M1K_driverCores_x86_64=36 +D10M1K_driverMemory_x86_64=50G +D10M1K_numExectuors_x86_64=59 +D10M1K_executorCores_x86_64=4 +D10M1K_executorMemory_x86_64=15G +D10M1K_executorMemoryOverhead_x86_64=2G +D10M1K_extraJavaOptions_x86_64=-XX:+UseNUMA + + +D1M10K_driverCores_aarch64=36 +D1M10K_driverMemory_aarch64=50G +D1M10K_numExectuors_aarch64=15 +D1M10K_executorCores_aarch64=19 +D1M10K_executorMemory_aarch64=61G +D1M10K_executorMemoryOverhead_aarch64=7G +D1M10K_extraJavaOptions_aarch64=-XX:+UseNUMA + +D1M10K_driverCores_x86_64=36 +D1M10K_driverMemory_x86_64=50G +D1M10K_numExectuors_x86_64=18 +D1M10K_executorCores_x86_64=13 +D1M10K_executorMemory_x86_64=50G +D1M10K_executorMemoryOverhead_x86_64=5G +D1M10K_extraJavaOptions_x86_64=-XX:+UseNUMA + + +MESH_driverCores_aarch64=36 +MESH_driverMemory_aarch64=50G +MESH_numExectuors_aarch64=15 +MESH_executorCores_aarch64=19 +MESH_executorMemory_aarch64=61G +MESH_executorMemoryOverhead_aarch64=7G +MESH_extraJavaOptions_aarch64=-Xms61g + +MESH_driverCores_x86_64=36 +MESH_driverMemory_x86_64=50G +MESH_numExectuors_x86_64=18 +MESH_executorCores_x86_64=13 +MESH_executorMemory_x86_64=50G +MESH_executorMemoryOverhead_x86_64=5G +MESH_extraJavaOptions_x86_64=-Xms50g + diff --git a/tools/kal-test/conf/ml/pearson/pearson.yml b/tools/kal-test/conf/ml/pearson/pearson.yml new file mode 100644 index 0000000..0064e81 --- /dev/null +++ b/tools/kal-test/conf/ml/pearson/pearson.yml @@ -0,0 +1,36 @@ +#Pearson model params + +pearson: + opt: + dataframe: + CP10M1K: + pt: 280 + CP2M5K: + pt: 280 + CP1M10K: + pt: 280 + + rdd: + CP10M1K: + pt: 280 + CP2M5K: + pt: 280 + CP1M10K: + pt: 
280 + + raw: + dataframe: + CP10M1K: + pt: 234 + CP2M5K: + pt: 234 + CP1M10K: + pt: 234 + + rdd: + CP10M1K: + pt: 234 + CP2M5K: + pt: 234 + CP1M10K: + pt: 234 diff --git a/tools/kal-test/conf/ml/pearson/pearson_spark.properties b/tools/kal-test/conf/ml/pearson/pearson_spark.properties new file mode 100644 index 0000000..166ce32 --- /dev/null +++ b/tools/kal-test/conf/ml/pearson/pearson_spark.properties @@ -0,0 +1,19 @@ +# Spark parameters +master=yarn +deployMode=client + +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=15 +executorCores_aarch64=19 +executorMemory_aarch64=63G +extraJavaOptions_aarch64=-Xms63g +execMemOverhead_aarch64=5G + +driverCores_x86_64=36 +driverMemory_x86_64=50G +numExectuors_x86_64=18 +executorCores_x86_64=13 +executorMemory_x86_64=50G +extraJavaOptions_x86_64=-Xms50g +execMemOverhead_x86_64=5G diff --git a/tools/kal-test/conf/ml/ps/ps.yml b/tools/kal-test/conf/ml/ps/ps.yml new file mode 100644 index 0000000..2810a5f --- /dev/null +++ b/tools/kal-test/conf/ml/ps/ps.yml @@ -0,0 +1,65 @@ +#PrefixSpan model params + +ps: + opt: + kosarak: + numPartitions: 280 + minSupport: 0.001 + maxPatternLength: 10 + maxLocalProjDBSize: 100000000 + localTimeout: '15' + filterCandidates: 'true' + projDBstep: '10' + redistributeData: true + + IBM10M47: + numPartitions: 280 + minSupport: 0.002 + maxPatternLength: 10 + maxLocalProjDBSize: 10000000 + localTimeout: '70' + filterCandidates: 'true' + projDBstep: '10' + redistributeData: false + + IBM100M47: + numPartitions: 280 + minSupport: 0.0021 + maxPatternLength: 10 + maxLocalProjDBSize: 100000000 + localTimeout: '70' + filterCandidates: 'true' + projDBstep: '10' + redistributeData: false + + raw: + kosarak: + numPartitions: 232 + minSupport: 0.001 + maxPatternLength: 10 + maxLocalProjDBSize: 10000000 + localTimeout: None + filterCandidates: None + projDBstep: None + redistributeData: true + + IBM10M47: + numPartitions: 232 + minSupport: 0.002 + maxPatternLength: 10 + maxLocalProjDBSize: 1000000 + localTimeout: None + filterCandidates: None + projDBstep: None + redistributeData: false + + IBM100M47: + numPartitions: 232 + minSupport: 0.0021 + maxPatternLength: 10 + maxLocalProjDBSize: 10000000 + localTimeout: None + filterCandidates: None + projDBstep: None + redistributeData: false + diff --git a/tools/kal-test/conf/ml/ps/ps_spark.properties b/tools/kal-test/conf/ml/ps/ps_spark.properties new file mode 100644 index 0000000..9f4b3cc --- /dev/null +++ b/tools/kal-test/conf/ml/ps/ps_spark.properties @@ -0,0 +1,16 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=35 +executorCores_aarch64=8 +executorMemory_aarch64=26G +extraJavaOptions_aarch64=-Xms26g + +driverCores_x86_64=30 +driverMemory_x86_64=100G +numExectuors_x86_64=29 +executorCores_x86_64=8 +executorMemory_x86_64=31G +extraJavaOptions_x86_64=-Xms31g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/rf/rf.yml b/tools/kal-test/conf/ml/rf/rf.yml new file mode 100644 index 0000000..3340336 --- /dev/null +++ b/tools/kal-test/conf/ml/rf/rf.yml @@ -0,0 +1,466 @@ +#RF model params + +rf: + opt: + classification: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 5 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + 
maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + rcv: + genericPt: 300 + maxMemoryInMB: 16384 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 5 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + mnist8m: + genericPt: 250 + maxMemoryInMB: 2048 + pt: 20 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 20 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + rcv: + genericPt: 220 + maxMemoryInMB: 10240 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + regression: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 11 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + rcv: + genericPt: 220 + maxMemoryInMB: 16384 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 20 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 11 + maxBins: 128 + useNodeIdCache: false + 
checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + rcv: + genericPt: 220 + maxMemoryInMB: 10240 + pt: 36 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + + raw: + classification: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 72 + numCopiesInput: 1 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 1024 + pt: 72 + numCopiesInput: 1 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 32 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + rcv: + genericPt: 300 + maxMemoryInMB: 16384 + pt: 32 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 23 + numCopiesInput: 5 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 22 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 18 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 15 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + rcv: + genericPt: 220 + maxMemoryInMB: 10240 + pt: 32 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + regression: + dataframe: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 6 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 36 + numCopiesInput: 6 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 36 + numCopiesInput: 5 + numTrees: 20 + maxDepth: 11 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "" + featuresType: "array" + rcv: + genericPt: 180 + maxMemoryInMB: 16384 + pt: 32 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + 
bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" + rdd: + higgs: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 18 + numCopiesInput: 6 + numTrees: 20 + maxDepth: 17 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + mnist8m: + genericPt: 300 + maxMemoryInMB: 2048 + pt: 22 + numCopiesInput: 6 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 10 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + epsilon: + genericPt: 300 + maxMemoryInMB: 4096 + pt: 18 + numCopiesInput: 5 + numTrees: 20 + maxDepth: 11 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "auto" + featuresType: "array" + rcv: + genericPt: 180 + maxMemoryInMB: 10240 + pt: 32 + numCopiesInput: 7 + numTrees: 20 + maxDepth: 13 + maxBins: 128 + useNodeIdCache: false + checkpointInterval: 10 + numClasses: 2 + bcVariables: false + featureSubsetStrategy: "0.1" + featuresType: "fasthashmap" \ No newline at end of file diff --git a/tools/kal-test/conf/ml/rf/rf_spark.properties b/tools/kal-test/conf/ml/rf/rf_spark.properties new file mode 100644 index 0000000..7106467 --- /dev/null +++ b/tools/kal-test/conf/ml/rf/rf_spark.properties @@ -0,0 +1,91 @@ +# Spark parameters + +driverCores=36 +driverMemory=50g +master=yarn +deployMode=client + +maxFailures=1 +compress=false + +# arm +aarch64_classification_higgs_numExectuors=35 +aarch64_classification_higgs_executorCores=8 +aarch64_classification_higgs_executorMemory=26G +aarch64_classification_higgs_extraJavaOptions=-Xms26g + +aarch64_regression_higgs_numExectuors=35 +aarch64_regression_higgs_executorCores=8 +aarch64_regression_higgs_executorMemory=26G +aarch64_regression_higgs_extraJavaOptions=-Xms26g + +aarch64_classification_mnist8m_numExectuors=35 +aarch64_classification_mnist8m_executorCores=8 +aarch64_classification_mnist8m_executorMemory=26G +aarch64_classification_mnist8m_extraJavaOptions=-Xms26g + +aarch64_regression_mnist8m_numExectuors=12 +aarch64_regression_mnist8m_executorCores=23 +aarch64_regression_mnist8m_executorMemory=79G +aarch64_regression_mnist8m_extraJavaOptions=-Xms79g + +aarch64_classification_epsilon_numExectuors=12 +aarch64_classification_epsilon_executorCores=23 +aarch64_classification_epsilon_executorMemory=79G +aarch64_classification_epsilon_extraJavaOptions=-Xms79g + +aarch64_regression_epsilon_numExectuors=35 +aarch64_regression_epsilon_executorCores=8 +aarch64_regression_epsilon_executorMemory=26G +aarch64_regression_epsilon_extraJavaOptions=-Xms26g + +aarch64_classification_rcv_numExectuors=35 +aarch64_classification_rcv_executorCores=8 +aarch64_classification_rcv_executorMemory=26G +aarch64_classification_rcv_extraJavaOptions=-Xms26g + +aarch64_regression_rcv_numExectuors=35 +aarch64_regression_rcv_executorCores=8 +aarch64_regression_rcv_executorMemory=26G +aarch64_regression_rcv_extraJavaOptions=-Xms26g + +# x86_64 +x86_64_classification_higgs_numExectuors=29 +x86_64_classification_higgs_executorCores=8 +x86_64_classification_higgs_executorMemory=31G +x86_64_classification_higgs_extraJavaOptions=-Xms31g + +x86_64_regression_higgs_numExectuors=12 +x86_64_regression_higgs_executorCores=19 +x86_64_regression_higgs_executorMemory=79G +x86_64_regression_higgs_extraJavaOptions=-Xms79g + +x86_64_classification_mnist8m_numExectuors=29 
+x86_64_classification_mnist8m_executorCores=8 +x86_64_classification_mnist8m_executorMemory=31G +x86_64_classification_mnist8m_extraJavaOptions=-Xms31g + +x86_64_regression_mnist8m_numExectuors=12 +x86_64_regression_mnist8m_executorCores=19 +x86_64_regression_mnist8m_executorMemory=79G +x86_64_regression_mnist8m_extraJavaOptions=-Xms79g + +x86_64_classification_epsilon_numExectuors=12 +x86_64_classification_epsilon_executorCores=19 +x86_64_classification_epsilon_executorMemory=79G +x86_64_classification_epsilon_extraJavaOptions=-Xms79g + +x86_64_regression_epsilon_numExectuors=12 +x86_64_regression_epsilon_executorCores=19 +x86_64_regression_epsilon_executorMemory=79G +x86_64_regression_epsilon_extraJavaOptions=-Xms79g + +x86_64_classification_rcv_numExectuors=12 +x86_64_classification_rcv_executorCores=19 +x86_64_classification_rcv_executorMemory=79G +x86_64_classification_rcv_extraJavaOptions=-Xms79g + +x86_64_regression_rcv_numExectuors=12 +x86_64_regression_rcv_executorCores=19 +x86_64_regression_rcv_executorMemory=79G +x86_64_regression_rcv_extraJavaOptions=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/simrank/simrank.yml b/tools/kal-test/conf/ml/simrank/simrank.yml new file mode 100644 index 0000000..517f415 --- /dev/null +++ b/tools/kal-test/conf/ml/simrank/simrank.yml @@ -0,0 +1,64 @@ +#simrank model params + +simrank: + opt: + simrank3w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + simrank5w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + + simrank7w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + simrank3w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 + + simrank5w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 + + simrank7w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 + raw: + simrank3w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + simrank5w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + simrank7w-aarch64: + numPartitions: 284 + damp: 0.6 + maxIter: 5 + + simrank3w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 + + simrank5w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 + + simrank7w-x86_64: + numPartitions: 236 + damp: 0.6 + maxIter: 5 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/simrank/simrank_spark.properties b/tools/kal-test/conf/ml/simrank/simrank_spark.properties new file mode 100644 index 0000000..458fa73 --- /dev/null +++ b/tools/kal-test/conf/ml/simrank/simrank_spark.properties @@ -0,0 +1,18 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores_aarch64=36 +driverMemory_aarch64=50G +numExectuors_aarch64=71 +executorCores_aarch64=4 +executorMemory_aarch64=12G +execMemOverhead_aarch64=2G +extraJavaOptions_aarch64=-Xms12g + +driverCores_x86_64=36 +driverMemory_x86_64=50G +numExectuors_x86_64=59 +executorCores_x86_64=4 +executorMemory_x86_64=15G +execMemOverhead_x86_64=2G +extraJavaOptions_x86_64=-Xms15g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/spca/spca.yml b/tools/kal-test/conf/ml/spca/spca.yml new file mode 100644 index 0000000..a876978 --- /dev/null +++ b/tools/kal-test/conf/ml/spca/spca.yml @@ -0,0 +1,75 @@ +#SPCA model params + +spca: + opt: + Kemelmacher: + pt: 284 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9694 + pcPath: "pc_boostkit_Kemelmacher.txt" + sigmaPath: "sigma_boostkit_Kemelmacher" + + mesh_deform: + pt: 284 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9394 + pcPath: "pc_boostkit_Mesh.txt" + sigmaPath: "sigma_boostkit_Mesh" + + wathen100: + pt: 284 + k: 500 + sep: 
" " + dataFormat: "coo" + numCols: 30402 + pcPath: "pc_boostkit_wathen.txt" + sigmaPath: "sigma_boostkit_wathen" + + MOLIERE: + pt: 284 + k: 10 + sep: " " + dataFormat: "coo" + numCols: 30239688 + pcPath: "pc_boostkit_MOLIERE.txt" + sigmaPath: "sigma_boostkit_MOLIERE" + raw: + Kemelmacher: + pt: 236 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9694 + pcPath: "pc_baseline_Kemelmacher.txt" + sigmaPath: "sigma_baseline_Kemelmacher" + + mesh_deform: + pt: 236 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9394 + pcPath: "pc_baseline_Mesh.txt" + sigmaPath: "sigma_baseline_Mesh" + + wathen100: + pt: 236 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 30402 + pcPath: "pc_baseline_wathen.txt" + sigmaPath: "sigma_baseline_wathen" + + MOLIERE: + pt: 236 + k: 10 + sep: " " + dataFormat: "coo" + numCols: 30239688 + pcPath: "pc_baseline_MOLIERE.txt" + sigmaPath: "sigma_baseline_MOLIERE" diff --git a/tools/kal-test/conf/ml/spca/spca_spark.properties b/tools/kal-test/conf/ml/spca/spca_spark.properties new file mode 100644 index 0000000..40a372b --- /dev/null +++ b/tools/kal-test/conf/ml/spca/spca_spark.properties @@ -0,0 +1,70 @@ +# Spark parameters +master=yarn +deployMode=client +compress=false + +driverCores_Kemelmacher_aarch64=50 +driverMemory_Kemelmacher_aarch64=50G +numExectuors_Kemelmacher_aarch64=71 +executorCores_Kemelmacher_aarch64=4 +executorMemory_Kemelmacher_aarch64=12G +extraJavaOptions_Kemelmacher_aarch64=-Xms12g +execMemOverhead_Kemelmacher_aarch64=2G + +driverCores_mesh_deform_aarch64=50 +driverMemory_mesh_deform_aarch64=50G +numExectuors_mesh_deform_aarch64=71 +executorCores_mesh_deform_aarch64=4 +executorMemory_mesh_deform_aarch64=12G +extraJavaOptions_mesh_deform_aarch64=-Xms12g +execMemOverhead_mesh_deform_aarch64=2G + +driverCores_wathen100_aarch64=50 +driverMemory_wathen100_aarch64=50G +numExectuors_wathen100_aarch64=71 +executorCores_wathen100_aarch64=4 +executorMemory_wathen100_aarch64=12G +extraJavaOptions_wathen100_aarch64=-Xms12g +execMemOverhead_wathen100_aarch64=2G + +driverCores_MOLIERE_aarch64=50 +driverMemory_MOLIERE_aarch64=50G +numExectuors_MOLIERE_aarch64=71 +executorCores_MOLIERE_aarch64=4 +executorMemory_MOLIERE_aarch64=12G +extraJavaOptions_MOLIERE_aarch64=-Xms12g +execMemOverhead_MOLIERE_aarch64=2G + + + +driverCores_Kemelmacher_x86_64=50 +driverMemory_Kemelmacher_x86_64=50G +numExectuors_Kemelmacher_x86_64=59 +executorCores_Kemelmacher_x86_64=4 +executorMemory_Kemelmacher_x86_64=15G +extraJavaOptions_Kemelmacher_x86_64=-Xms15g +execMemOverhead_Kemelmacher_x86_64=2G + +driverCores_mesh_deform_x86_64=50 +driverMemory_mesh_deform_x86_64=50G +numExectuors_mesh_deform_x86_64=59 +executorCores_mesh_deform_x86_64=4 +executorMemory_mesh_deform_x86_64=15G +extraJavaOptions_mesh_deform_x86_64=-Xms15g +execMemOverhead_mesh_deform_x86_64=2G + +driverCores_wathen100_x86_64=50 +driverMemory_wathen100_x86_64=50G +numExectuors_wathen100_x86_64=59 +executorCores_wathen100_x86_64=4 +executorMemory_wathen100_x86_64=15G +extraJavaOptions_wathen100_x86_64=-Xms15g +execMemOverhead_wathen100_x86_64=2G + +driverCores_MOLIERE_x86_64=50 +driverMemory_MOLIERE_x86_64=50G +numExectuors_MOLIERE_x86_64=59 +executorCores_MOLIERE_x86_64=4 +executorMemory_MOLIERE_x86_64=15G +extraJavaOptions_MOLIERE_x86_64=-Xms15g +execMemOverhead_MOLIERE_x86_64=2G \ No newline at end of file diff --git a/tools/kal-test/conf/ml/spearman/spearman.yml b/tools/kal-test/conf/ml/spearman/spearman.yml new file mode 100644 index 0000000..b25318f --- /dev/null +++ 
b/tools/kal-test/conf/ml/spearman/spearman.yml @@ -0,0 +1,22 @@ +#Spearman model params + +spearman: + opt: + CP10M1K: + numPartitions: 284 + + CP2M5K: + numPartitions: 284 + + CP1M10K: + numPartitions: 280 + + raw: + CP10M1K: + numPartitions: 2360 + + CP2M5K: + numPartitions: 2360 + + CP1M10K: + numPartitions: 2320 diff --git a/tools/kal-test/conf/ml/spearman/spearman_spark.properties b/tools/kal-test/conf/ml/spearman/spearman_spark.properties new file mode 100644 index 0000000..242d2f3 --- /dev/null +++ b/tools/kal-test/conf/ml/spearman/spearman_spark.properties @@ -0,0 +1,54 @@ +# Spark parameters +master=yarn +deployMode=client +driverCores=36 +driverMemory=50G + +executorCores_CP10M1K=4 +executorCores_CP2M5K=4 +executorCores_CP1M10K=8 + +numExectuors_CP10M1K_aarch64=71 +executorMemory_CP10M1K_aarch64=9G +executorMemOverhead_CP10M1K_aarch64=5G +extraJavaOptions_CP10M1K_aarch64=-Xms9g + +numExectuors_CP2M5K_aarch64=71 +executorMemory_CP2M5K_aarch64=9G +executorMemOverhead_CP2M5K_aarch64=5G +extraJavaOptions_CP2M5K_aarch64=-Xms9g + +numExectuors_CP1M10K_aarch64=35 +executorMemory_CP1M10K_aarch64=18G +executorMemOverhead_CP1M10K_aarch64=10G +extraJavaOptions_CP1M10K_aarch64=-Xms18g + +numExectuors_CP10M1K_x86_64=59 +executorMemory_CP10M1K_x86_64=12G +executorMemOverhead_CP10M1K_x86_64=5G +extraJavaOptions_CP10M1K_x86_64=-Xms12g + +numExectuors_CP2M5K_x86_64=59 +executorMemory_CP2M5K_x86_64=12G +executorMemOverhead_CP2M5K_x86_64=5G +extraJavaOptions_CP2M5K_x86_64=-Xms12g + +numExectuors_CP1M10K_x86_64=29 +executorMemory_CP1M10K_x86_64=23G +executorMemOverhead_CP1M10K_x86_64=10G +extraJavaOptions_CP1M10K_x86_64=-Xms23g + +numExectuors_CP10M1K_raw=59 +executorMemory_CP10M1K_raw=15G +executorMemOverhead_CP10M1K_raw=2G +extraJavaOptions_CP10M1K_raw=-Xms15g + +numExectuors_CP2M5K_raw=59 +executorMemory_CP2M5K_raw=15G +executorMemOverhead_CP2M5K_raw=2G +extraJavaOptions_CP2M5K_raw=-Xms15g + +numExectuors_CP1M10K_raw=29 +executorMemory_CP1M10K_raw=23G +executorMemOverhead_CP1M10K_raw=10G +extraJavaOptions_CP1M10K_raw=-Xms23g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/svd/svd.yml b/tools/kal-test/conf/ml/svd/svd.yml new file mode 100644 index 0000000..471f9dc --- /dev/null +++ b/tools/kal-test/conf/ml/svd/svd.yml @@ -0,0 +1,67 @@ +#SVD model params + +svd: + opt: + D10M1K: + pt: 280 + k: 500 + sep: "," + dataFormat: "dense" + numCols: 0 + numRows: 0 + + D1M10K: + pt: 285 + k: 500 + sep: "," + dataFormat: "dense" + numCols: 0 + numRows: 0 + + MESH: + pt: 284 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9394 + numRows: 0 + + RUCCI: + pt: 284 + k: 100 + sep: " " + dataFormat: "coo" + numCols: 109901 + numRows: 0 + raw: + D10M1K: + pt: 280 + k: 500 + sep: "," + dataFormat: "dense" + numCols: 0 + numRows: 0 + + D1M10K: + pt: 285 + k: 500 + sep: "," + dataFormat: "dense" + numCols: 0 + numRows: 0 + + MESH: + pt: 284 + k: 500 + sep: " " + dataFormat: "coo" + numCols: 9394 + numRows: 0 + + RUCCI: + pt: 284 + k: 100 + sep: " " + dataFormat: "coo" + numCols: 109901 + numRows: 0 diff --git a/tools/kal-test/conf/ml/svd/svd_spark.properties b/tools/kal-test/conf/ml/svd/svd_spark.properties new file mode 100644 index 0000000..d0c8407 --- /dev/null +++ b/tools/kal-test/conf/ml/svd/svd_spark.properties @@ -0,0 +1,69 @@ +# Spark parameters +master=yarn +deployMode=client +compress=false + +driverCores_D10M1K_aarch64=35 +driverMemory_D10M1K_aarch64=50G +numExectuors_D10M1K_aarch64=35 +executorCores_D10M1K_aarch64=8 +executorMemory_D10M1K_aarch64=26G 
+extraJavaOptions_D10M1K_aarch64=-XX:+UseNUMA +execMemOverhead_D10M1K_aarch64=3G + +driverCores_D1M10K_aarch64=35 +driverMemory_D1M10K_aarch64=50G +numExectuors_D1M10K_aarch64=15 +executorCores_D1M10K_aarch64=19 +executorMemory_D1M10K_aarch64=61G +extraJavaOptions_D1M10K_aarch64=-XX:+UseNUMA +execMemOverhead_D1M10K_aarch64=7G + +driverCores_MESH_aarch64=35 +driverMemory_MESH_aarch64=50G +numExectuors_MESH_aarch64=71 +executorCores_MESH_aarch64=4 +executorMemory_MESH_aarch64=12G +extraJavaOptions_MESH_aarch64=-Xms12g +execMemOverhead_MESH_aarch64=2G + +driverCores_RUCCI_aarch64=35 +driverMemory_RUCCI_aarch64=50G +numExectuors_RUCCI_aarch64=71 +executorCores_RUCCI_aarch64=4 +executorMemory_RUCCI_aarch64=12G +extraJavaOptions_RUCCI_aarch64=-Xms12g +execMemOverhead_RUCCI_aarch64=2G + + +driverCores_D10M1K_x86_64=30 +driverMemory_D10M1K_x86_64=50G +numExectuors_D10M1K_x86_64=29 +executorCores_D10M1K_x86_64=8 +executorMemory_D10M1K_x86_64=31G +extraJavaOptions_D10M1K_x86_64=-XX:+UseNUMA +execMemOverhead_D10M1K_x86_64=4G + +driverCores_D1M10K_x86_64=30 +driverMemory_D1M10K_x86_64=50G +numExectuors_D1M10K_x86_64=18 +executorCores_D1M10K_x86_64=13 +executorMemory_D1M10K_x86_64=50G +extraJavaOptions_D1M10K_x86_64=-XX:+UseNUMA +execMemOverhead_D1M10K_x86_64=5G + +driverCores_MESH_x86_64=30 +driverMemory_MESH_x86_64=50G +numExectuors_MESH_x86_64=18 +executorCores_MESH_x86_64=13 +executorMemory_MESH_x86_64=50G +extraJavaOptions_MESH_x86_64=-Xms50g +execMemOverhead_MESH_x86_64=5G + +driverCores_RUCCI_x86_64=30 +driverMemory_RUCCI_x86_64=50G +numExectuors_RUCCI_x86_64=18 +executorCores_RUCCI_x86_64=13 +executorMemory_RUCCI_x86_64=50G +extraJavaOptions_RUCCI_x86_64=-Xms50g +execMemOverhead_RUCCI_x86_64=5G \ No newline at end of file diff --git a/tools/kal-test/conf/ml/svm/svm.yml b/tools/kal-test/conf/ml/svm/svm.yml new file mode 100644 index 0000000..c8b3e62 --- /dev/null +++ b/tools/kal-test/conf/ml/svm/svm.yml @@ -0,0 +1,39 @@ +#SVM model params +svm: +  opt: +    ECBDL14: +      numPartitions: 180 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 + +    epsilon: +      numPartitions: 180 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 + +    rcv: +      numPartitions: 180 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 + +  raw: +    ECBDL14: +      numPartitions: 284 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 + +    epsilon: +      numPartitions: 284 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 + +    rcv: +      numPartitions: 284 +      regParam: 0.01 +      maxIter: 1000 +      tolerance: 1E-6 diff --git a/tools/kal-test/conf/ml/svm/svm_spark.properties b/tools/kal-test/conf/ml/svm/svm_spark.properties new file mode 100644 index 0000000..fa9295d --- /dev/null +++ b/tools/kal-test/conf/ml/svm/svm_spark.properties @@ -0,0 +1,18 @@ +# Spark parameters +aarch64_numExectuors=12 +aarch64_executorCores=23 +aarch64_executorMemory=79G +aarch64_extraJavaOptions=-Xms79g +aarch64_driverCores=36 +aarch64_driverMemory=50G +aarch64_master=yarn +aarch64_deployMode=client + +x86_64_numExectuors=12 +x86_64_executorCores=19 +x86_64_executorMemory=79G +x86_64_extraJavaOptions=-Xms79g +x86_64_driverCores=30 +x86_64_driverMemory=50G +x86_64_master=yarn +x86_64_deployMode=client \ No newline at end of file diff --git a/tools/kal-test/conf/ml/te/te.yml b/tools/kal-test/conf/ml/te/te.yml new file mode 100644 index 0000000..befa10f --- /dev/null +++ b/tools/kal-test/conf/ml/te/te.yml @@ -0,0 +1,61 @@ +#TargetEncoder model params + +te: +  opt: +    movielens: +      numPartitions: 280 +      problemType: "classification" +      targetColName: "target" +      blendedAvgSmoothing: 20 +
blendedAvgInflectionPoint: 10 + + taobao: + numPartitions: 280 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + criteo40m: + numPartitions: 280 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + criteo150m: + numPartitions: 280 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + + raw: + movielens: + numPartitions: 228 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + taobao: + numPartitions: 228 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + criteo40m: + numPartitions: 228 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 + + criteo150m: + numPartitions: 228 + problemType: "classification" + targetColName: "target" + blendedAvgSmoothing: 20 + blendedAvgInflectionPoint: 10 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/te/te_spark.properties b/tools/kal-test/conf/ml/te/te_spark.properties new file mode 100644 index 0000000..c6c5cf4 --- /dev/null +++ b/tools/kal-test/conf/ml/te/te_spark.properties @@ -0,0 +1,51 @@ +# Spark parameters + +master=yarn +deployMode=client +aarch64_movielens_driverCores=8 +aarch64_movielens_driverMemory=128G +aarch64_movielens_numExecutors=35 +aarch64_movielens_executorCores=6 +aarch64_movielens_executorMemory=26G + +aarch64_taobao_driverCores=8 +aarch64_taobao_driverMemory=128G +aarch64_taobao_numExecutors=35 +aarch64_taobao_executorCores=6 +aarch64_taobao_executorMemory=26G + +aarch64_criteo40m_driverCores=8 +aarch64_criteo40m_driverMemory=128G +aarch64_criteo40m_numExecutors=35 +aarch64_criteo40m_executorCores=6 +aarch64_criteo40m_executorMemory=26G + +aarch64_criteo150m_driverCores=8 +aarch64_criteo150m_driverMemory=128G +aarch64_criteo150m_numExecutors=35 +aarch64_criteo150m_executorCores=6 +aarch64_criteo150m_executorMemory=26G + +x86_64_movielens_driverCores=76 +x86_64_movielens_driverMemory=128G +x86_64_movielens_numExecutors=3 +x86_64_movielens_executorCores=76 +x86_64_movielens_executorMemory=316G + +x86_64_taobao_driverCores=76 +x86_64_taobao_driverMemory=128G +x86_64_taobao_numExecutors=3 +x86_64_taobao_executorCores=76 +x86_64_taobao_executorMemory=316G + +x86_64_criteo40m_driverCores=76 +x86_64_criteo40m_driverMemory=128G +x86_64_criteo40m_numExecutors=3 +x86_64_criteo40m_executorCores=76 +x86_64_criteo40m_executorMemory=316G + +x86_64_criteo150m_driverCores=76 +x86_64_criteo150m_driverMemory=128G +x86_64_criteo150m_numExecutors=3 +x86_64_criteo150m_executorCores=76 +x86_64_criteo150m_executorMemory=316G \ No newline at end of file diff --git a/tools/kal-test/conf/ml/word2vec/word2vec.yml b/tools/kal-test/conf/ml/word2vec/word2vec.yml new file mode 100644 index 0000000..13059ff --- /dev/null +++ b/tools/kal-test/conf/ml/word2vec/word2vec.yml @@ -0,0 +1,182 @@ +word2vec: + opt: + scala2.11: + cate: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 0 + numIterations: 3 + + item: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 282 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 0 + numIterations: 1 + + node: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + 
minCount: 5 + window: 10 + regularization: 0.05 + repetition: 0 + numIterations: 3 + + taobao: + eval: "taobao" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 7 + numIterations: 1 + + scala2.12: + cate: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 0 + numIterations: 3 + + item: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 282 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 0 + numIterations: 1 + + node: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 3 + numIterations: 7 + + taobao: + eval: "taobao" + learningRate: 0.025 + numPartitions: 276 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0.05 + repetition: 7 + numIterations: 5 + + raw: + scala2.11: + cate: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 500 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 1 + + item: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 1 + + node: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 400 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 1 + + taobao: + eval: "taobao" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 500 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 1 + + scala2.12: + cate: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 3 + + item: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 1 + + node: + eval: "alibaba" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 30 + + taobao: + eval: "taobao" + learningRate: 0.025 + numPartitions: 228 + vectorSize: 100 + minCount: 5 + window: 10 + regularization: 0 + repetition: 0 + numIterations: 100 \ No newline at end of file diff --git a/tools/kal-test/conf/ml/word2vec/word2vec_spark.properties b/tools/kal-test/conf/ml/word2vec/word2vec_spark.properties new file mode 100644 index 0000000..4c92bd7 --- /dev/null +++ b/tools/kal-test/conf/ml/word2vec/word2vec_spark.properties @@ -0,0 +1,49 @@ +# Spark parameters +master=yarn +deployMode=client +compress=false +driverCores=50 +driverMemory=50G +execMemOverhead=2G + +# aarch64 +numExectuors_cate_aarch64=12 +executorCores_cate_aarch64=23 +executorMemory_cate_aarch64=79G +extraJavaOptions_cate_aarch64=-Xms79g + +numExectuors_item_aarch64=3 +executorCores_item_aarch64=64 +executorMemory_item_aarch64=315G +extraJavaOptions_item_aarch64=-Xms315g + +numExectuors_node_aarch64=12 +executorCores_node_aarch64=23 +executorMemory_node_aarch64=79G +extraJavaOptions_node_aarch64=-Xms79g + +numExectuors_taobao_aarch64=12 +executorCores_taobao_aarch64=23 +executorMemory_taobao_aarch64=79G +extraJavaOptions_taobao_aarch64=-Xms79g + +# x86_64 +numExectuors_cate_x86_64=12 +executorCores_cate_x86_64=19 +executorMemory_cate_x86_64=79G +extraJavaOptions_cate_x86_64=-Xms79g + +numExectuors_item_x86_64=12 
+executorCores_item_x86_64=19 +executorMemory_item_x86_64=79G +extraJavaOptions_item_x86_64=-Xms79g + +numExectuors_node_x86_64=12 +executorCores_node_x86_64=19 +executorMemory_node_x86_64=79G +extraJavaOptions_node_x86_64=-Xms79g + +numExectuors_taobao_x86_64=12 +executorCores_taobao_x86_64=19 +executorMemory_taobao_x86_64=79G +extraJavaOptions_taobao_x86_64=-Xms79g \ No newline at end of file diff --git a/tools/kal-test/conf/ml/xgbt/xgbt.yml b/tools/kal-test/conf/ml/xgbt/xgbt.yml new file mode 100644 index 0000000..367a343 --- /dev/null +++ b/tools/kal-test/conf/ml/xgbt/xgbt.yml @@ -0,0 +1,252 @@ +#XGBT model params +xgbt: + aarch64: + opt: + classification: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "binary:logistic" + num_round: 500 + num_workers: 51 + nthread: 4 + tree_method: "hist" + grow_policy: "depthwiselossltd" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "multi:softprob" + num_round: 500 + num_class: 10 + num_workers: 24 + nthread: 9 + tree_method: "hist" + grow_policy: "depthwiselossltd" + regression: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 51 + nthread: 4 + tree_method: "hist" + grow_policy: "depthwiselossltd" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 120 + nthread: 2 + tree_method: "hist" + grow_policy: "depthwiselossltd" + raw: + classification: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "binary:logistic" + num_round: 500 + num_workers: 51 + nthread: 4 + tree_method: "hist" + grow_policy: "depthwise" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "multi:softprob" + num_round: 500 + num_class: 10 + num_workers: 24 + nthread: 9 + tree_method: "hist" + grow_policy: "depthwise" + regression: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 51 + nthread: 4 + tree_method: "hist" + grow_policy: "depthwise" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 120 + nthread: 2 + tree_method: "hist" + grow_policy: "depthwise" + x86_64: + raw: + classification: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + objective: "binary:logistic" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwise" 
+ mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + objective: "multi:softprob" + num_round: 500 + num_class: 10 + num_workers: 24 + nthread: 9 + tree_method: "hist" + grow_policy: "depthwise" + regression: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + objective: "reg:squarederror" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwise" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + objective: "reg:squarederror" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwise" + opt: + classification: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "binary:logistic" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwiselossltd" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "multi:softprob" + num_round: 500 + num_class: 10 + num_workers: 18 + nthread: 9 + tree_method: "hist" + grow_policy: "depthwiselossltd" + regression: + higgs: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwiselossltd" + mnist8m: + eta: 0.1 + gamma: 1 + min_child_weight: 1 + max_depth: 6 + allow_non_zero_for_missing: true + vectorType: "sparse" + enable_bbgen: true + rabit_enable_tcp_no_delay: true + objective: "reg:squarederror" + num_round: 500 + num_workers: 33 + nthread: 7 + tree_method: "hist" + grow_policy: "depthwiselossltd" \ No newline at end of file diff --git a/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties b/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties new file mode 100644 index 0000000..bcd3299 --- /dev/null +++ b/tools/kal-test/conf/ml/xgbt/xgbt_spark.properties @@ -0,0 +1,77 @@ +# Spark parameters + +master=yarn +deployMode=client + +x86_64_classification_higgs_driverCores=40 +x86_64_classification_higgs_driverMemory=300G +x86_64_classification_higgs_numExecutors=33 +x86_64_classification_higgs_executorCores=7 +x86_64_classification_higgs_taskCpus=7 +x86_64_classification_higgs_executorMemory=27G +x86_64_classification_higgs_extraJavaOptions=-Xms27g +x86_64_classification_higgs_numPartitions=33 + +aarch64_classification_higgs_driverCores=40 +aarch64_classification_higgs_driverMemory=300G +aarch64_classification_higgs_numExecutors=51 +aarch64_classification_higgs_executorCores=4 +aarch64_classification_higgs_taskCpus=4 +aarch64_classification_higgs_executorMemory=17G +aarch64_classification_higgs_extraJavaOptions=-Xms17g +aarch64_classification_higgs_numPartitions=51 + +x86_64_classification_mnist8m_driverCores=40 +x86_64_classification_mnist8m_driverMemory=300G +x86_64_classification_mnist8m_numExecutors=18 +x86_64_classification_mnist8m_executorCores=9 +x86_64_classification_mnist8m_taskCpus=9 +x86_64_classification_mnist8m_executorMemory=50G +x86_64_classification_mnist8m_extraJavaOptions=-Xms50g 
+x86_64_classification_mnist8m_numPartitions=18 + +aarch64_classification_mnist8m_driverCores=40 +aarch64_classification_mnist8m_driverMemory=300G +aarch64_classification_mnist8m_numExecutors=24 +aarch64_classification_mnist8m_executorCores=9 +aarch64_classification_mnist8m_taskCpus=9 +aarch64_classification_mnist8m_executorMemory=37G +aarch64_classification_mnist8m_extraJavaOptions=-Xms37g +aarch64_classification_mnist8m_numPartitions=24 + + +x86_64_regression_higgs_driverCores=40 +x86_64_regression_higgs_driverMemory=300G +x86_64_regression_higgs_numExecutors=33 +x86_64_regression_higgs_executorCores=7 +x86_64_regression_higgs_taskCpus=7 +x86_64_regression_higgs_executorMemory=27G +x86_64_regression_higgs_extraJavaOptions=-Xms27g +x86_64_regression_higgs_numPartitions=33 + +aarch64_regression_higgs_driverCores=40 +aarch64_regression_higgs_driverMemory=300G +aarch64_regression_higgs_numExecutors=51 +aarch64_regression_higgs_executorCores=4 +aarch64_regression_higgs_taskCpus=4 +aarch64_regression_higgs_executorMemory=17G +aarch64_regression_higgs_extraJavaOptions=-Xms17g +aarch64_regression_higgs_numPartitions=51 + +x86_64_regression_mnist8m_driverCores=40 +x86_64_regression_mnist8m_driverMemory=300G +x86_64_regression_mnist8m_numExecutors=33 +x86_64_regression_mnist8m_executorCores=7 +x86_64_regression_mnist8m_taskCpus=7 +x86_64_regression_mnist8m_executorMemory=27G +x86_64_regression_mnist8m_extraJavaOptions=-Xms27g +x86_64_regression_mnist8m_numPartitions=33 + +aarch64_regression_mnist8m_driverCores=40 +aarch64_regression_mnist8m_driverMemory=300G +aarch64_regression_mnist8m_numExecutors=120 +aarch64_regression_mnist8m_executorCores=2 +aarch64_regression_mnist8m_taskCpus=2 +aarch64_regression_mnist8m_executorMemory=7G +aarch64_regression_mnist8m_extraJavaOptions=-Xms7g +aarch64_regression_mnist8m_numPartitions=120 diff --git a/tools/kal-test/pom.xml b/tools/kal-test/pom.xml new file mode 100644 index 0000000..f0b4e30 --- /dev/null +++ b/tools/kal-test/pom.xml @@ -0,0 +1,129 @@ + + 4.0.0 + com.bigdata + kal-test_${scala.version} + 0.1 + ${project.artifactId} + Spark KAL algorithm test + 2020 + jar + + + 1.8 + 1.8 + UTF-8 + 2.12 + 2.2.0 + 3.1.1 + + + + + ai.h2o + sparkling-water-core_2.11 + 3.34.0.6-1-2.4 + + + com.google.guava + guava + + + + + com.microsoft.ml.spark + mmlspark_2.12_spark3.1.2 + 0.0.0+79-09152193 + + + ai.h2o + sparkling-water-ml_2.11 + 3.34.0.6-1-2.4 + + + ml.dmlc + xgboost4j-spark_2.12 + 1.3.1 + + + org.apache.spark.graphx.lib + boostkit-graph-kernel-client_${scala.version} + ${kal.version} + spark${spark.version} + + + org.apache.spark + boostkit-ml-kernel-client_${scala.version} + ${kal.version} + spark${spark.version} + + + org.apache.spark + spark-graphx_${scala.version} + ${spark.version} + + + org.apache.spark + spark-mllib_${scala.version} + ${spark.version} + + + it.unimi.dsi + fastutil + 8.3.1 + + + com.github.haifengl + smile-math + 2.5.2 + + + com.github.haifengl + smile-core + 2.5.2 + + + com.linkedin.isolation-forest + isolation-forest_3.1.1_2.12 + 3.0.0 + + + org.yaml + snakeyaml + 1.19 + + + org.apache.mahout + mahout-core + 0.9 + + + org.apache.hadoop + hadoop-core + + + + + + src/main/scala + + + net.alchim31.maven + scala-maven-plugin + 3.2.0 + + + + compile + + + + -dependencyfile + ${project.build.directory}/.scala_dependencies + + + + + + + + diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BFSVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BFSVerify.scala new file mode 100644 index 0000000..4378b1f --- 
/dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BFSVerify.scala @@ -0,0 +1,30 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object BFSVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = ";" + val sparkConf = new SparkConf().setAppName("BFSVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(f => { + val arr = f.split(split) + (arr(0).toLong, arr(1).split(",").sorted, arr(2).toInt) + }).map(f => (f._1, f._2.mkString({","}) + f._3)) + val rdd1 = sc.textFile(path1).map(f => { + val arr = f.split(split) + (arr(0).toLong, arr(1).split(",").sorted, arr(2).toInt) + }).map(f => (f._1, f._2.mkString({","}) + f._3)) + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val statisticR = rdd0.join(rdd1).mapValues(f => f._1 == f._2) + val joinCnt = statisticR.count() + val flag = cnt0 == cnt1 && cnt0 == joinCnt + sc.stop() + println(s"Static Nodes Count: $joinCnt, $cnt0, $cnt1") + println(s"The algorithm is correct: $flag") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BetweennessClosenessVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BetweennessClosenessVerify.scala new file mode 100644 index 0000000..7e6837c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/BetweennessClosenessVerify.scala @@ -0,0 +1,33 @@ +package com.bigdata.compare.graph + +import com.bigdata.graph.Util + +import org.apache.spark.{SparkConf, SparkContext} + +object BetweennessClosenessVerify { + def main(args: Array[String]): Double = { + val groundTruthPath = args(0) + val output = args(1) + val numPartitions = 232 + val sparkConf = new SparkConf().setAppName("BetweennessClosenessVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val groundTruthSet = Util + .readTopKResultFromHDFS(sc, groundTruthPath, ",", numPartitions) + .collect() + .toSet + val resultSet = Util + .readTopKResultFromHDFS(sc, output, ",", numPartitions) + .collect() + .toSet + val accuracy = groundTruthSet + .intersect(resultSet) + .size + .toDouble / groundTruthSet.size + val flag = accuracy >= 0.93 + + sc.stop() + println(s"Accuracy: ${accuracy}") + println(s"The algorithm is correct: ${flag}") + return accuracy + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CCVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CCVerify.scala new file mode 100644 index 0000000..8d9d801 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CCVerify.scala @@ -0,0 +1,28 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object CCVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "," + val sparkConf = new SparkConf().setAppName("CCVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + arr(1) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + arr(1) + }).cache() + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val flag = cnt0 == cnt1 + sc.stop() + println(s"Static Nodes Count: $cnt0, $cnt1") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala new file mode 100644 index 
0000000..c765c09 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/CDDegreeVerify.scala @@ -0,0 +1,20 @@ +// scalastyle:off println +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object CDDegreeVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("CDDegreeVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).collect() + val rdd1 = sc.textFile(path1).collect() + + val flag = rdd0.diff(rdd1).length == 0 + sc.stop() + println(s"Static Nodes Count: ${rdd0.length}, ${rdd1.length}") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/ClusteringCoefficientTCVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/ClusteringCoefficientTCVerify.scala new file mode 100644 index 0000000..1462c0a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/ClusteringCoefficientTCVerify.scala @@ -0,0 +1,28 @@ +package com.bigdata.compare.graph + +import com.bigdata.graph.Util + +import org.apache.spark.{SparkConf, SparkContext} + +object ClusteringCoefficientTCVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "," + val partNum = 1000 + val sparkConf = new SparkConf().setAppName("ClusteringCoefficientTCVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = Util.readEdgeFileFromHDFS(sc, path0, split, partNum) + val rdd1 = Util.readEdgeFileFromHDFS(sc, path1, split, partNum) + val res = rdd1.union(rdd0).reduceByKey((a, b) => { + Math.abs(a - b) + }).filter(f => f._2 > 1e-7) + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val flag = res.count() == 0 + sc.stop() + println(s"Static Nodes Count: $cnt0, $cnt1") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala new file mode 100644 index 0000000..3d223fb --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/DeepWalkVerify.scala @@ -0,0 +1,156 @@ +/* +// scalastyle:off + +package com.bigdata.compare.graph + +import java.io.InputStreamReader + +import scala.collection.Map +import scala.util.Try +import com.bigdata.utils.Utils +import com.bigdata.graph.{DeepWalkConfig, DeepWalkParams} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.apache.spark.graphx.VertexId +import org.apache.spark.ml.linalg.{DenseVector, Vector} +import org.apache.spark.rdd.RDD +import org.apache.spark.{SparkConf, SparkContext} +import org.yaml.snakeyaml.representer.Representer +import smile.math.MathEx.cos +import smile.validation.AUC + +object DeepWalkVerify { + + def readEdgeListFromHDFS( + sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(VertexId, VertexId, Double)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } + else { + val x = line.trim.split(split) + if (x.length < 2) { + Iterator.empty + } + else { + var w = x(2).toDouble + Iterator.single((x(0).toLong, x(1).toLong, w)) + } + } + }) + } + + def readUndirectEdgeFromHDFS( + sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(Long, Long)] = { + 
sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val node1 = x(0).toLong + val node2 = x(1).toLong + Iterator.single((node1, node2)) + } + } + }) + } + + def readNode2VecModel(sc: SparkContext, input: String): RDD[(Long, Vector)] = { + val rdd: RDD[(Long, Vector)] = sc + .textFile(input) + .mapPartitions(it => { + val regexp = "([0-9]+) \\((.*)\\)".r + it.map { case regexp(u, emb) => (u.toLong, new DenseVector(emb.split(",") + .map(_.toDouble)): Vector) + } + }).cache() + rdd + } + + def get(modelRDD: RDD[(Long, Vector)]): Map[Long, Vector] = { + modelRDD.collectAsMap() + } + + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName) = (modelConfSplit(0), modelConfSplit(1)) + val graphPath = args(1) + val negEdgePath = args(2) + val embeddingPath = args(3) + val isRaw = args(4) + + val representer = new Representer + representer.addClassTag(classOf[DeepWalkParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/deepwalk/deepwalk.yml") + val yaml = new Yaml(new Constructor(classOf[DeepWalkConfig]), representer, options) + val description = new TypeDescription(classOf[DeepWalkParams]) + yaml.addTypeDescription(description) + val config: DeepWalkConfig = yaml.load(stream).asInstanceOf[DeepWalkConfig] + val paramsMap = + config.deepwalk.get(datasetName).get(isRaw match { + case "no" => "opt" + case _ => "raw" + }) + + val params = new DeepWalkParams() + + params.setDatasetName(datasetName) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setAlgorithmName("DeepWalk") + params.setTestcaseType(s"DeepWalk_${datasetName}") + + val sparkConf = new SparkConf().setAppName("DeepwalkVerify") + val sc = SparkContext.getOrCreate(sparkConf) + + val edgesRDD = readEdgeListFromHDFS(sc, graphPath, params.getSplitGraph, params.getPartitions) + val negativeEdgesRDD = readUndirectEdgeFromHDFS(sc, negEdgePath, ",", params.getPartitions) + + val nvModel: collection.Map[Long, Vector] = get(readNode2VecModel(sc, embeddingPath)) + + val nvModelBC = sc.broadcast(nvModel) + edgesRDD.foreachPartition(_ => nvModelBC.value) + + val positiveEdgesScores: Array[Double] = edgesRDD + .flatMap({ case (src, dst, weight) => + Try(Iterator.single(cos(nvModelBC.value(src).toArray, nvModelBC.value(dst).toArray))) + .getOrElse(Iterator.empty) + }) + .filter(score => !score.isInfinity && !score.isNaN) + .collect() + + val negativeEdgesScores: Array[Double] = negativeEdgesRDD + .flatMap({ case (src, dst) => + Try(Iterator.single(cos(nvModelBC.value(src).toArray, nvModelBC.value(dst).toArray))) + .getOrElse(Iterator.empty) + }) + .filter(score => !score.isInfinity && !score.isNaN) + .collect() + + val truths = Array.fill(positiveEdgesScores.length)(1) ++ Array.fill(negativeEdgesScores.length)(0) + val auc = AUC.of(truths, (positiveEdgesScores ++ negativeEdgesScores)) + println(s"Link Prediction AUC Score = $auc") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + + */ diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/IncPageRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/IncPageRankVerify.scala new file mode 100644 index 
0000000..98ec4fc --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/IncPageRankVerify.scala @@ -0,0 +1,171 @@ +package com.bigdata.compare.graph + +import java.text.DecimalFormat + +import scala.collection.mutable + +import org.apache.commons.math3.distribution.NormalDistribution + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel +import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} + +object IncPageRankVerify { + val filterFactor = 0.15 + val numLevels = 100 + val start = -1.0 + val end = 5.0 + + def getLevelArray(sc: SparkContext, numNodes: Long): Broadcast[Array[Long]] = { + val levelArray = new Array[Long](numLevels + 1) + val posDis = new NormalDistribution(0, 1.0) + val totalDiff = posDis.cumulativeProbability(end) - posDis.cumulativeProbability(start) + val step = (end - start) / numLevels + for (i <- 0 until numLevels) { + val diff = posDis.cumulativeProbability(-1 + (i + 1) * step) - posDis.cumulativeProbability(-1 + i * step) + val interval = math.floor((diff / totalDiff) * numNodes).toLong + levelArray(i + 1) = interval + levelArray(i) + } + levelArray(numLevels) = numNodes + 1 + sc.broadcast(levelArray) + } + + def getLevel(sc: SparkContext, prRdd: RDD[(Long, Double)]): RDD[(Long, Int)] = { + val finalData = prRdd.filter(_._2 > filterFactor) + val finalDataNum = finalData.count() + val finalData0 = prRdd.filter(_._2 == filterFactor).map(f => (f._1, 0)) + val finalLevelArray = getLevelArray(sc, finalDataNum) + val level = finalData.map{ + case (node, pr) => (math.log(pr - filterFactor), node) + } + .sortBy(v => v) + .zipWithIndex() + .map{ + case ((_, node), index) => (node, finalLevelArray.value.indexWhere(_ >= index)) + } + .union(finalData0) + level.foreachPartition(f => {}) + prRdd.unpersist(false) + level + } + + def getFullLevel(sc: SparkContext, + fullResult: RDD[(Long, Double)], + vertexWithStatus: RDD[(Long, (Int, Double))], + partition: Int, + storageLevel: StorageLevel): RDD[(Long, Int)] = { + val incPr = vertexWithStatus.map(f => (f._1, f._2._1)) + .zipPartitions(fullResult, true)((tagIter, prIter) => { + val tagMap = tagIter.toMap + val prMap = prIter.toMap + val map = new mutable.HashMap[Long, (Int, Double)]() + tagMap.keys.foreach(key => { + map += (key -> (tagMap.getOrElse(key, 2), prMap.getOrElse(key, 0.0))) + }) + map.toIterator + }) + .filter(f => f._2._1 == 1) + .map(f => (f._1, f._2._2)) + .persist(storageLevel) + incPr.foreachPartition(f => {}) + fullResult.unpersist(false) + + val orgPr = vertexWithStatus.filter(f => f._2._1 == 0).map(f => (f._1, f._2._2)).persist(storageLevel) + orgPr.foreachPartition(f => {}) + val prRdd = incPr.union(orgPr).persist(storageLevel) + prRdd.foreachPartition(f => {}) + incPr.unpersist(false) + orgPr.unpersist(false) + + getLevel(sc, prRdd) + } + + def getIncLevel(sc: SparkContext, + incResult: RDD[(Long, Double)], + vertexWithStatus: RDD[(Long, (Int, Double))], + partition: Int, + storageLevel: StorageLevel): RDD[(Long, Int)] = { + val orgPr = vertexWithStatus.filter(f => f._2._1 == 0).map(f => (f._1, f._2._2)).persist(storageLevel) + orgPr.foreachPartition(f => {}) + val prRdd = incResult.union(orgPr).persist(storageLevel) + prRdd.foreachPartition(f => {}) + orgPr.unpersist(false) + + getLevel(sc, prRdd) + } + + def getAccuracy(sc: SparkContext, + graph: RDD[(Long, (Int, Double))], + fullResult: RDD[(Long, Double)], + incResult: RDD[(Long, Double)], + numPart: Int, + 
outputPath: String = ""): Double = { + val incIndex = getIncLevel(sc, incResult, graph, numPart, StorageLevel.MEMORY_ONLY_SER) + val fullIndex = getFullLevel(sc, fullResult, graph, numPart, StorageLevel.MEMORY_ONLY_SER) + val eval = fullIndex.leftOuterJoin(incIndex).mapValues { + case (gt, res) => res.fold(-1)(v => { + val score = math.abs(gt - v) + if (score > 10) -2 else score + }) + }.map(v => (v._2, 1L)).reduceByKey(_ + _) + + val sum = eval.values.sum() + if (outputPath.length != 0) { + eval.repartition(1).sortBy(_._1) + .map(f => f._1 + "," + f._2 + "," + (f._2 / sum).formatted("%.6f")) + .saveAsTextFile(outputPath) + } + + val accNum = eval.filter(f => f._1 == 0 || f._1 == 1).values.sum() + accNum / sum + } + + def main(args: Array[String]): Unit = { + val master = args(0) + val incPrPath = args(1) + val gdPrPath = args(2) + val split = args(3) + val numPart = args(4).toInt + val outputPath = args(5) + val graphPath = args(6) + + val sparkConf = new SparkConf().setAppName("IncPageRankVerify").setMaster(master) + val spark = SparkSession.builder().config(sparkConf).getOrCreate() + spark.sparkContext.setLogLevel("WARN") + val sc = spark.sparkContext + + val graph = spark.read + .orc(graphPath) + .rdd + .map(row => (row.getAs[Long]("srcId"), + (row.getAs[Int]("srcStatus"), + row.getAs[Double]("pr")))) + .setName("graph") + .partitionBy(new HashPartitioner(numPart)) + .persist(StorageLevel.MEMORY_ONLY_SER) + graph.foreachPartition(f => {}) + + val incResult = sc.textFile(incPrPath).flatMap(line => { + val items = line.trim.split(split) + Iterator.single((items(0).toLong), items(1).toDouble) + }).partitionBy(new HashPartitioner(numPart)) + .persist(StorageLevel.MEMORY_ONLY_SER) + incResult.foreachPartition(f => {}) + + val fullResult = sc.textFile(gdPrPath).flatMap(line => { + val items = line.trim.split(split) + Iterator.single((items(0).toLong), items(1).toDouble) + }).partitionBy(new HashPartitioner(numPart)) + .persist(StorageLevel.MEMORY_ONLY_SER) + fullResult.foreachPartition(f => {}) + + val acc = getAccuracy(sc, graph, fullResult, incResult, numPart, outputPath) + val df = new DecimalFormat("#.00") + val flag = df.format(acc).toDouble >= 0.96 + spark.stop() + println("accuracy: ", acc) + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/KCoreVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/KCoreVerify.scala new file mode 100644 index 0000000..6bf0358 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/KCoreVerify.scala @@ -0,0 +1,31 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object KCoreVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "," + val sparkConf = new SparkConf().setAppName("KCoreVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).trim.toLong, arr(1).trim.toInt) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).trim.toLong, arr(1).trim.toInt) + }).cache() + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val statisticR = rdd0.join(rdd1).mapValues(f => f._1 == f._2) + val joinCnt = statisticR.count() + val equalsCnt = statisticR.filter(_._2).count() + val flag = cnt0 == cnt1 && cnt0 == joinCnt && joinCnt == equalsCnt + sc.stop() + println(s"Static Nodes Count: $joinCnt, $cnt0, $cnt1") + println(s"The algorithm is 
correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/LpaVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/LpaVerify.scala new file mode 100644 index 0000000..23b0d79 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/LpaVerify.scala @@ -0,0 +1,30 @@ +package com.bigdata.compare.graph + +import com.bigdata.graph.Util + +import org.apache.spark.graphx.lib.Modularity +import org.apache.spark.{SparkConf, SparkContext} + +object LpaVerify { + def main(args: Array[String]): Unit = { + val dataset = args(0) + val path0 = args(1) + val path1 = args(2) + val inputSplit = " " + val outputSplit = "," + val partNum = 232 + val sparkConf = new SparkConf().setAppName("LpaVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val nodes0 = Util.readCommFromHDFS(sc, path0, outputSplit, partNum) + val nodes1 = Util.readCommFromHDFS(sc, path1, outputSplit, partNum) + val edges = Util.readGraphFromHDFS(sc, dataset, inputSplit, false, partNum) + val modularity0 = Modularity.run(nodes0, edges, false, partNum) + val modularity1 = Modularity.run(nodes1, edges, false, partNum) + val m0 = modularity0.formatted("%.5f") + val m1 = modularity1.formatted("%.5f") + val flag = m0 <= m1 + sc.stop() + println(s"Modularity: ${modularity0}, ${modularity1}.") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MceWceVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MceWceVerify.scala new file mode 100644 index 0000000..66a22f1 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MceWceVerify.scala @@ -0,0 +1,28 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object MceWceVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + "/clique_info" + val path1 = args(1) + "/clique_info" + val split = "," + val sparkConf = new SparkConf().setAppName("MCE/WCEVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + arr(1) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + arr(1) + }).cache() + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val flag = cnt0 == cnt1 + sc.stop() + println(s"Static Nodes Count: $cnt0, $cnt1") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MsspVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MsspVerify.scala new file mode 100644 index 0000000..45fc941 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/MsspVerify.scala @@ -0,0 +1,60 @@ +package com.bigdata.compare.graph + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.rdd.RDD +import org.apache.spark.{SparkConf, SparkContext} + +object MsspVerify { + def loadGraph(sc: SparkContext, + input: String, + isWeighted: Boolean, + split: String, + partNum: Int): RDD[(Long, ArrayBuffer[(Int, Double)])] = { + val inputRdd = sc.textFile(input, partNum).flatMap(line => { + if (null == line || line.startsWith("#")) { + Iterator.empty + } else { + var m_line = line + if (m_line.contains("(") || m_line.contains(")")) { + m_line = m_line.replaceAll("\\(", "") + m_line = m_line.replaceAll("\\)", " ") + } + val x = m_line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val a = x(1).split((" ")) + val b = new ArrayBuffer[(Int, Double)]() + 
a.map(f => { + val c = f.split(",") + b.append((c(0).toInt, c(1).toDouble)) + }) + Iterator((x(0).toLong, b)) + } + } + }) + inputRdd + } + + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val partNum = 232 + val split = ":" + val sparkConf = new SparkConf().setAppName("MsspVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = loadGraph(sc, path0, true, split, partNum).map(x => (x._1, x._2)) + val rdd1 = loadGraph(sc, path1, true, split, partNum).map(x => (x._1, x._2)) + val res = rdd0.join(rdd1).map(f => { + f._2._1.sortBy(_._1).sameElements(f._2._2.sortBy(_._1)) + }).distinct().collect() + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val flag = !res.contains(false) + sc.stop() + println(s"Static Nodes Count: $cnt0, $cnt1") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/Node2vecVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/Node2vecVerify.scala new file mode 100644 index 0000000..f1f16ae --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/Node2vecVerify.scala @@ -0,0 +1,47 @@ +package com.bigdata.compare.graph + +import scala.util.Try + +import com.bigdata.graph.Util +import smile.validation.AUC; + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.{SparkConf, SparkContext}; + +object Node2vecVerify { + def main(args: Array[String]): Double = { + val graphPath = args(0) // dataset path + val negEdgePath = args(1) // groundTruth path + val embeddingPath = args(2) // output path + val partNum = args(3).toInt + val weighted = false + val sparkConf = new SparkConf().setAppName("Node2vecVerify") + val sc = SparkContext.getOrCreate(sparkConf) + + val edgesRdd = Util.readEdgeListFromHDFS(sc, graphPath, "\t", weighted, partNum) + val negativeEdgesRdd = Util.readUndirectDataFromHDFS(sc, negEdgePath, ",", partNum) + val nvModel: collection.Map[Long, Vector] = Util.get(Util.readNode2VecModel(sc, embeddingPath)) + val nvModelBC = sc.broadcast(nvModel) + edgesRdd.foreachPartition(_ => nvModelBC.value) + val positiveEdgesScores: Array[Double] = edgesRdd.flatMap({ + case (src, dst, weight) => + Try(Iterator.single(Util.distCos(nvModelBC.value(src).toArray, nvModelBC.value(dst).toArray))) + .getOrElse(Iterator.empty) + }).filter(score => !score.isInfinity && !score.isNaN) + .collect() + val negativeEdgesScores: Array[Double] = negativeEdgesRdd + .flatMap({ + case (src, dst) => + Try(Iterator.single(Util.distCos(nvModelBC.value(src.toLong).toArray, nvModelBC.value(dst.toLong).toArray))) + .getOrElse(Iterator.empty) + }).filter(score => !score.isInfinity && !score.isNaN) + .collect() + val truths = Array.fill(positiveEdgesScores.length)(1) ++ Array.fill(negativeEdgesScores.length)(0) + val auc = AUC.of(truths, (positiveEdgesScores ++ negativeEdgesScores)) + val flag = auc >= 0.90 + sc.stop() + println(s"Link Prediction AUC score = $auc") + println(s"The algorithm is correct: ${flag}") + return auc + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PageRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PageRankVerify.scala new file mode 100644 index 0000000..fc9db43 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PageRankVerify.scala @@ -0,0 +1,34 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object PageRankVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val 
path1 = args(1) + val split = "\t" + val sparkConf = new SparkConf().setAppName("PageRankVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + + val cnt0 = rdd0.filter(_._2 > 0).count() + val cnt1 = rdd1.filter(_._2 > 0).count() + val statisticR = rdd0.join(rdd1).map(f => Math.abs(f._2._1 - f._2._2)).cache() + val joinCnt = statisticR.count() + val maxError = statisticR.max() + val minError = statisticR.min() + val flag = maxError <= 10e-7 + sc.stop() + println(s"Static Nodes Count(>0): $joinCnt, $cnt0, $cnt1") + println(s"Static Max relative Error: $maxError") + println(s"Static Min relative Error: $minError") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PersonalizedPageRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PersonalizedPageRankVerify.scala new file mode 100644 index 0000000..6c03276 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/PersonalizedPageRankVerify.scala @@ -0,0 +1,77 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object PersonalizedPageRankVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val api = args(2) + val src = args(3) + val split = "\t" + val sparkConf = new SparkConf().setAppName("PersonalizedPageRankVerify") + val sc = SparkContext.getOrCreate(sparkConf) + + var maxError = 0.0 + var minError = 1.0 + api match { + case "fixMS" => + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + val len = arr(1).length + val err = arr(1).substring(1, len - 1) + (arr(0).toInt, err) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + val arr1 = arr(1).split(",\\[") + val err = arr1(2).dropRight(2) + (arr(0).toInt, err) + }).cache() + val statisticR = rdd0.join(rdd1).map(f => { + val err1 = f._2._1.split(",") + val err2 = f._2._2.split(",") + var min = 1.0 + var max = 0.0 + for (i <- 0 until src.toInt) { + var err = 0.0 + if (err2(i).toDouble == 0) { + err = Math.abs(err1(i).toDouble - err2(i).toDouble) + } else { + err = Math.abs(err1(i).toDouble - err2(i).toDouble) / err2(i).toDouble + } + min = Math.min(min, err) + max = Math.max(max, err) + } + (min, max) + }).cache() + minError = statisticR.keys.min() + maxError = statisticR.values.max() + case _ => + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + + val statisticR = rdd0.join(rdd1).map(f => { + if (f._2._2 == 0) { + Math.abs(f._2._1 - f._2._2) + } else { + Math.abs(f._2._1 - f._2._2) / f._2._2 + } + }).cache() + maxError = statisticR.max() + minError = statisticR.min() + } + + val flag = maxError <= 10e-7 + sc.stop() + println(s"Static Max relative Error: $maxError") + println(s"Static Min relative Error: $minError") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SCCVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SCCVerify.scala new file mode 100644 index 0000000..cfb57db --- /dev/null +++ 
b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/SCCVerify.scala @@ -0,0 +1,42 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object SCCVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "," + val sparkConf = new SparkConf().setAppName("SCCVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).toLong, arr(1).toLong) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).toLong, arr(1).toLong) + }).cache() + val rdd00 = rdd0.groupBy(_._2).flatMap(f => { + val arr = f._2.toArray + val minId = arr.map(_._1).min + arr.map(f => (f._1, minId)) + }).cache() + val rdd11 = rdd1.groupBy(_._2).flatMap(f => { + val arr = f._2.toArray + val minId = arr.map(_._1).min + arr.map(f => (f._1, minId)) + }).cache() + + val cnt0 = rdd0.count() + val cnt1 = rdd1.count() + val statisticR = rdd00.join(rdd11).filter { + case (vid, (sc1, sc2)) => sc1 == sc2 + } + val joinCnt = statisticR.count() + val flag = cnt0 == cnt1 && cnt0 == joinCnt + sc.stop() + println(s"Static Nodes Count: $joinCnt, $cnt0, $cnt1") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrillionPageRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrillionPageRankVerify.scala new file mode 100644 index 0000000..b145989 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrillionPageRankVerify.scala @@ -0,0 +1,33 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object TrillionPageRankVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "\t" + val sparkConf = new SparkConf().setAppName("TrillionPageRankVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + val rdd1 = sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + val cnt0 = rdd0.map(_._1).count() + val cnt1 = rdd1.map(_._1).count() + val statisticR = rdd0.join(rdd1).map(f => Math.abs(f._2._1 / cnt0 - f._2._2 / cnt1)).cache() + val joinCnt = statisticR.count() + val maxError = statisticR.max() + val minError = statisticR.min() + val flag = maxError <= 1e-7 + sc.stop() + println(s"Static Nodes Count: $joinCnt, $cnt0, $cnt1") + println(s"Static Max relative Error: $maxError") + println(s"Static Min relative Error: $minError") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrustRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrustRankVerify.scala new file mode 100644 index 0000000..5aa0eba --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/TrustRankVerify.scala @@ -0,0 +1,40 @@ +package com.bigdata.compare.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object TrustRankVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val split = "\t" + val sparkConf = new SparkConf().setAppName("TrustRankVerify") + val sc = SparkContext.getOrCreate(sparkConf) + val rdd0 = sc.textFile(path0).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + val rdd1 = 
sc.textFile(path1).map(s => { + val arr = s.split(split) + (arr(0).toInt, arr(1).toDouble) + }).cache() + + val cnt0 = rdd0.filter(_._2 > 0).count() + val cnt1 = rdd1.filter(_._2 > 0).count() + val statisticR = rdd0.join(rdd1).map(f => { + if (f._2._2 != 0) { + Math.abs(f._2._1 - f._2._2) / f._2._2 + } else { + Math.abs(f._2._1 - f._2._2) + } + }).cache() + val joinCnt = statisticR.count() + val maxError = statisticR.max() + val minError = statisticR.min() + val flag = maxError <= 10e-7 + sc.stop() + println(s"Static Nodes Count(>0): $joinCnt, $cnt0, $cnt1") + println(s"Static Max relative Error: $maxError") + println(s"Static Min relative Error: $minError") + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/graph/WeightedPageRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/WeightedPageRankVerify.scala new file mode 100644 index 0000000..5c07687 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/graph/WeightedPageRankVerify.scala @@ -0,0 +1,29 @@ +package com.bigdata.compare.graph + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{col, udf} + +object WeightedPageRankVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) //origin path + val path1 = args(1) //opt path + + val sparkConf = new SparkConf().setAppName("WeightedPageRankVerify") + val spark = SparkSession.builder().config(sparkConf).getOrCreate() + + val toDUDF = udf((s: String) => s.toDouble) + val toLUDF = udf((s: String) => s.toLong) + val data0 = spark.read.parquet(path0) + val data1 = spark.read.option("SEP", ",").csv(path1) + .withColumn("pr1", toDUDF(col("_c1"))) + .withColumn("_c0", toDUDF(col("_c0"))) + val result = data0.join(data1, data0("id") === data1("_c0")) + val toDiff = udf((i1: Double, i2: Double) => Math.abs(i1 - i2) / i1) + val summ = result.withColumn("diff", toDiff(col("value"), col("pr1"))).cache() + summ.select("diff").summary().show(false) + val maxError = summ.select("diff").rdd.map(r => r.getAs[Double]("diff")).max() + val flag = maxError <= 1e-7 + println(s"The algorithm is correct: ${flag}") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala new file mode 100644 index 0000000..4bdb9d1 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala @@ -0,0 +1,44 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object DTBVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("DTBVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (!fs.exists(res1File) || !fs.exists(res2File)) { + return "invaildComparison" + } + + val 
output = sc.textFile(path0).repartition(100) + val refRes = sc.textFile(path1).repartition(100) + val dataDiff1Cnt = output.subtract(refRes).count() + val dataDiff2Cnt = refRes.subtract(output).count() + if (dataDiff1Cnt != 0 || dataDiff2Cnt != 0) { + System.err.println(s"[ERROR] diff1Cnt: ${dataDiff1Cnt}, diff2Cnt: ${dataDiff2Cnt}") + System.err.println("output data is mismatch!") + return "false" + } else { + return "true" + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DownEvaluationVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DownEvaluationVerify.scala new file mode 100644 index 0000000..1179087 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DownEvaluationVerify.scala @@ -0,0 +1,41 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object DownEvaluationVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("DownUpEvaluationVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (!fs.exists(res1File) || !fs.exists(res2File)) { + return "invaildComparison" + } + val res1 = sc.textFile(path0).collect().head.toDouble + val res2 = sc.textFile(path1).collect().head.toDouble + if (res1 * (1 - 0.005) - res2 <= 0.0) { + return "true" + } + else { + return "false" + } + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala new file mode 100644 index 0000000..72a82e5 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EncoderVerify.scala @@ -0,0 +1,57 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object EncoderVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("DownUpEvaluationVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (!fs.exists(res1File) || !fs.exists(res2File)) { + return "invaildComparison" + } + val data1 = spark.read.parquet(path0) + val data2 = spark.read.parquet(path1) + data1.show() + val rdd1 
= data1.rdd.map{ + r => + val field1 = r.getAs[Seq[Int]]("7xxx_index")(0) + val field2 = r.getAs[Seq[Int]]("15xxx_index")(0) + (field1, field2) + } + val rdd2 = data2.rdd.map{ + r => + val field1 = r.getAs[Seq[Int]]("7xxx_index")(0) + val field2 = r.getAs[Seq[Int]]("15xxx_index")(0) + (field1, field2) + } + val diff = rdd1.subtract(rdd2).count() + println(s"Exec Successful: different count: ${diff}") + if (diff == 0) { + return "false" + } + else { + return "true" + } + + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EvaluationVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EvaluationVerify.scala new file mode 100644 index 0000000..3b47fb8 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/EvaluationVerify.scala @@ -0,0 +1,42 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object EvaluationVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("EvaluationVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/!ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (fs.exists(res1File) && fs.exists(res2File)) { + val res1 = sc.textFile(path0).collect() + val res2 = sc.textFile(path1).collect() + if (math.abs(res1(0).toDouble - res2(0).toDouble) / res1(0).toDouble <= 0.005) { + return "true" + } + else { + return "false" + } + } + else{ + return "invaildComparison" + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/FPGVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/FPGVerify.scala new file mode 100644 index 0000000..368e4af --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/FPGVerify.scala @@ -0,0 +1,53 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.Dataset + +import java.io.FileWriter + +object FPGVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("FPGVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def saveRes(df: Dataset[String], saveDataPath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(saveDataPath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + df.rdd.saveAsTextFile(saveDataPath) + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val 
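// A compact sketch of the metric-comparison rules used by DownEvaluationVerify and EvaluationVerify
// above (helper names are illustrative only): each verifier reads a single metric value from both
// result files and applies a 0.5% tolerance.
def withinRelativeTolerance(ref: Double, other: Double): Boolean =
  math.abs(ref - other) / ref <= 0.005   // EvaluationVerify: two-sided 0.5% band

def notWorseThan(ref: Double, other: Double): Boolean =
  ref * (1 - 0.005) - other <= 0.0       // DownEvaluationVerify: other may not fall more than 0.5% below ref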
sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (fs.exists(res1File) && fs.exists(res2File)) { + val res1 = sc.textFile(path0) + val res2 = sc.textFile(path1) + if (res1.subtract(res2).take(1).isEmpty && + res2.subtract(res1).take(1).isEmpty ) { + return "correct" + } else { + return "incorrect" + } + } + else{ + return "invaildComparison" + } + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/IDFVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/IDFVerify.scala new file mode 100644 index 0000000..81a258d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/IDFVerify.scala @@ -0,0 +1,61 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object IDFVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("IDFVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def saveRes(res: Array[Double], saveDataPath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(saveDataPath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + sc.parallelize(res).repartition(100).saveAsTextFile(saveDataPath) + } + + def isEqualRes(res1: Array[Double], res2: Array[Double]): Boolean = { + if (res1.length != res2.length) + return false + for (i <- res2.indices) { + if (math.abs(res1(i) - res2(i)) > 1e-6) + return false + } + true + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (fs.exists(res1File) && fs.exists(res2File)) { + val res1 = sc.textFile(path0).map(_.toDouble).collect().toArray + val res2 = sc.textFile(path1).map(_.toDouble).collect().toArray + val numMismatch = res2.seq.zip(res1).count(v => math.abs(v._1 - v._2) > 1E-1) + if (numMismatch != 0 ) { + return "true" + } else { + return "false" + } + } + else{ + return "invaildComparison" + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/KNNVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/KNNVerify.scala new file mode 100644 index 0000000..14f76d8 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/KNNVerify.scala @@ -0,0 +1,214 @@ +package com.bigdata.compare.ml + +import java.io.{FileWriter, PrintWriter} +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.neighbors.KNN +import org.apache.spark.sql.{Row, SparkSession} + +object KNNVerify extends Serializable { + + /** + * Calculate euclidean distance. 
+ */ + def euclideanDistance(v1: Vector, v2: Vector): Double = { + euclideanDistance(v1.toArray, v2.toArray) + } + + def euclideanDistance(v1: Array[Double], v2: Array[Double]): Double = { + math.sqrt(v1.indices.map(i => math.pow(v1(i) - v2(i), 2)).sum) + } + + /** + * 通过暴力求解,得到KNN的真实解 + */ + def writeResult(args: Array[String]): Unit = { + var exeTime = System.currentTimeMillis() + + val spark = SparkSession + .builder() + .appName("writeResults") + .getOrCreate() + val sc = spark.sparkContext + sc.setLogLevel("ERROR") + + // 算法参数 + var pt, k, testNum, testBatchSize = -1 + var dataPath = "" + + // 待写入的真实解本地路径 + var groundTruthLocalPath = "" + + args.sliding(2, 2).foreach { + case Array("--pt", value) => pt = value.toInt + case Array("--k", value) => k = value.toInt + case Array("--testNum", value) => testNum = value.toInt + case Array("--testBatchSize", value) => testBatchSize = value.toInt + case Array("--dataPath", value) => dataPath = value.toString + case Array("--groundTruthLocalPath", value) => groundTruthLocalPath = value.toString + case _ => + } + + // read data + val rawData = sc.textFile(dataPath) + .map(line => { + val arr = line.split("\t") + val id = arr(0).toLong + val feature = Vectors.dense(arr(1).split(",").map(_.toDouble)) + (id, feature) + }).cache() + + // split train/test datasets + val trainData = rawData.filter(_._1 >= testNum).repartition(pt).cache() + val testData = rawData.filter(_._1 < testNum).repartition(pt).cache() + println(s"-------- split data, trainNum=${trainData.count()}, testNum=${testData.count()} ----------") + rawData.unpersist(blocking = true) + + // search in batch + for(startIdx <- 0 until testNum by testBatchSize) { + exeTime = System.currentTimeMillis() + val endIdx = math.min(startIdx + testBatchSize, testNum) + val queryLocal = testData.filter(x => x._1 >= startIdx && x._1 < endIdx).collect() + val queryBd = sc.broadcast(queryLocal) + val neighbors = trainData.mapPartitions(iter => { + val curTrainData = iter.toArray + Iterator(queryBd.value.map{case (queryIdx, queryVector) => { + val distances = curTrainData.map{case (trainIdx, trainVector) => + (trainIdx, euclideanDistance(trainVector, queryVector))} + .sortBy(t => (t._2, t._1)).take(k) + (queryIdx, distances) + }}) + }).treeReduce((arr1, arr2) => { + arr1.indices.toArray.map(i => { + (arr1(i)._1, (arr1(i)._2 ++ arr2(i)._2).sortBy(t => (t._2, t._1)).take(k)) + }) + }, depth = 3) + val writer = new PrintWriter(s"${groundTruthLocalPath}/part-${startIdx}-${endIdx}") + neighbors.foreach{case(queryIdx, queryNN) => { + writer.write(queryIdx + "\t" + queryNN.map(_._1).mkString(",") + "\n") + }} + writer.close() + println(s"------ $startIdx-$endIdx done, time=${(System.currentTimeMillis() - exeTime) / 60000.0} ---------") + queryBd.destroy() + } + + spark.stop() + } + + def verify(args: Array[String]): Unit = { + val exeTime = System.currentTimeMillis() + val spark = SparkSession + .builder() + .appName("writeResults") + .getOrCreate() + val sc = spark.sparkContext + sc.setLogLevel("ERROR") + + // 算法参数 + var pt, k, testNum, testBatchSize = -1 + var dataPath = "" + var datasetName = "" + + // 真实解hdfs路径 + var groundTruthHDFSPath = "" + + args.sliding(2, 2).foreach { + case Array("--pt", value) => pt = value.toInt + case Array("--k", value) => k = value.toInt + case Array("--testNum", value) => testNum = value.toInt + case Array("--testBatchSize", value) => testBatchSize = value.toInt + case Array("--dataPath", value) => dataPath = value.toString + case Array("--dataset_name", value) => datasetName = 
value.toString + case Array("--groundTruthHDFSPath", value) => groundTruthHDFSPath = value.toString + case _ => + } + + // read data + import spark.implicits._ + val rawData = spark.sparkContext.textFile(dataPath) + .map(line => { + val arr = line.split("\t") + val id = arr(0).toLong + val feature = Vectors.dense(arr(1).split(",").map(_.toDouble)) + (id, feature) + }).toDF("id", "features").cache() + + // split train/test datasets + val trainDataDF = rawData.filter($"id" >= testNum).repartition(pt).cache() + val testDataDF = rawData.filter($"id" < testNum).repartition(pt).cache() + trainDataDF.count() + testDataDF.count() + rawData.unpersist(blocking = true) + + // fit + val model = new KNN() + .setFeaturesCol("features") + .setAuxiliaryCols(Array("id")) + .fit(trainDataDF) + + // transform + val testResults = model + .setNeighborsCol("neighbors") + .setDistanceCol("distances") + .setK(k) + .setTestBatchSize(testBatchSize) + .transform(testDataDF).cache() + testResults.count() + println(s"trainTime=${(System.currentTimeMillis() - exeTime) / 1000.0}") + + // 与groudtruth对比 + val trueResult = sc.textFile(groundTruthHDFSPath).map(line => { + val arr = line.split("\t") + val id = arr(0).toLong + val neighbors = arr(1).split(",").map(_.toInt) + .filter(neighborIdx => neighborIdx >= testNum).take(k) + (id, neighbors) + }).filter(_._2.length == k) + val combinedData = trueResult.toDF("id", "trueNeighbors") + .join(testResults.selectExpr("id", "neighbors", "distances"), "id") + .map(r => { + val trueNN = r.getAs[Seq[Int]]("trueNeighbors").toArray + val myNN = r.getAs[Seq[Row]]("neighbors").map(_.getAs[Long]("id").toInt).toArray + val myDistancesSize = r.getAs[Seq[Double]]("distances").toSet.size + (r.getAs[Long]("id"), trueNN, myNN, myDistancesSize) + }) + .filter(_._4 == k) + .cache() + // val actualTotalNum = combinedData.count() + + val incorrectCases = combinedData.map{case (id, trueNN, myNN, _) => { + val myNNSet = myNN.toSet + var isEqual = true + Range(0, k - 1).foreach(i => { + if(!myNNSet.contains(trueNN(i))) + isEqual = false + }) + (id, isEqual, trueNN, myNN) + }}.filter(!_._2).collect() + + var isCorrect = "" + if(incorrectCases.length == 0) + isCorrect = "correct" + else + isCorrect = "incorrect" + println(s"Exec Successful: isCorrect: ${isCorrect}") + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"KNN_${datasetName} ${isCorrect} \n") + writerIsCorrect.close() + spark.stop() + } + + def main(args: Array[String]): Unit = { + + var task: String = "" + args.sliding(2, 2).foreach { + case Array("--task", value) => task = value.toString.toLowerCase() + case _ => + } + require(Array("write", "verify").contains(task), s"Task name should be one of [write, verify], but got $task") + + task match { + case "write" => writeResult(args) + case "verify" => verify(args) + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/LDAVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/LDAVerify.scala new file mode 100644 index 0000000..55fc91f --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/LDAVerify.scala @@ -0,0 +1,42 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object LDAVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("LDAVerify") + val 
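// A single-query, in-memory sketch of the brute-force ground truth that KNNVerify.writeResult above
// computes per batch (the function name and signature are illustrative; it reuses the
// euclideanDistance helper defined in KNNVerify).
def bruteForceTopK(train: Array[(Long, Array[Double])], query: Array[Double], k: Int): Array[Long] =
  train.map { case (id, vec) => (id, KNNVerify.euclideanDistance(vec, query)) }
    .sortBy { case (id, d) => (d, id) }   // ties are broken by the smaller vertex id, as above
    .take(k)
    .map(_._1)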
spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (!fs.exists(res1File) || !fs.exists(res2File)) { + return "invaildComparison" + } + val res1 = sc.textFile(path0).collect().head.toDouble + val res2 = sc.textFile(path1).collect().head.toDouble + if ((res2 - res1) / math.abs(res2) <= 0.005) { + return "true" + } + else { + return "false" + } + } + +} + diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/MatrixVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/MatrixVerify.scala new file mode 100644 index 0000000..94d6ba9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/MatrixVerify.scala @@ -0,0 +1,88 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.mllib.linalg.DenseMatrix +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object MatrixVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("MatrixVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def saveMatrix(mat: DenseMatrix, saveDataPath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(saveDataPath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + val res = toRowMajorArray(mat) + sc.parallelize(res.map(_.mkString(";"))).saveAsTextFile(saveDataPath) + } + + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (fs.exists(res1File) && fs.exists(res2File)) { + val res1 = sc.textFile(path0).map(line => line.split(";").map(_.toDouble)).collect() + val res2 = sc.textFile(path1).map(line => line.split(";").map(_.toDouble)).collect() + if (isEqualMatrix(res1, res2)) { + return "true" + } + else { + return "false" + } + } + else{ + return "invaildComparison" + } + } + + def isEqualMatrix(res1: Array[Array[Double]], res2: Array[Array[Double]]): Boolean = { + if (res1.length != res2.length) + return false + for (i <- res2.indices) { + if (res1(i).length != res2(i).length) + return false + for (j <- res1(i).indices) { + if (math.abs(math.abs(res1(i)(j)) - math.abs(res2(i)(j))) > 1e-6) + return false + } + } + true + } + + def toRowMajorArray(matrix: DenseMatrix): Array[Array[Double]] = { + val nRow = matrix.numRows + val nCol = matrix.numCols + val arr = new Array[Array[Double]](nRow).map(_ => new Array[Double](nCol)) + if(matrix.isTransposed){ + var srcOffset = 0 + for{i <- 0 until nRow} { + System.arraycopy(matrix.values, 
srcOffset, arr(i), 0, nCol) + srcOffset += nCol + } + } else { + matrix.values.indices.foreach(idx => { + val j = math.floor(idx / nRow).toInt + val i = idx % nRow + arr(i)(j) = matrix.values(idx) + }) + } + arr + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/PrefixSpanVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/PrefixSpanVerify.scala new file mode 100644 index 0000000..7778063 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/PrefixSpanVerify.scala @@ -0,0 +1,58 @@ +package com.bigdata.compare.ml + +import io.airlift.compress.lz4.Lz4Codec +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.NullWritable +import org.apache.spark.sql.SparkSession +import org.apache.spark.rdd +import org.apache.spark.rdd.RDD + +import java.io.FileWriter + +object PrefixSpanVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("EvaluationVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def saveRes(res: RDD[String], savePath: String, sc: SparkContext): Unit ={ + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(savePath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + res.repartition(15).map((NullWritable.get(), _)) + .saveAsSequenceFile(savePath, Some(classOf[Lz4Codec])) + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (fs.exists(res1File) && fs.exists(res2File)) { + val res1 = sc.sequenceFile[NullWritable, String](path0).map(_._2).persist() + val res2 = sc.sequenceFile[NullWritable, String](path1).map(_._2).persist() + if (res2.subtract(res1).take(1).isEmpty && + res1.subtract(res2).take(1).isEmpty) { + return "true" + } + else { + return "false" + } + } + else{ + return "invaildComparison" + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SVDVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SVDVerify.scala new file mode 100644 index 0000000..a6e5efd --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SVDVerify.scala @@ -0,0 +1,115 @@ +package com.bigdata.compare.ml + +import org.apache.spark.mllib.linalg.DenseVector +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.mllib.linalg.DenseMatrix +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object SVDVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("SVDVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def 
saveMatrix(mat: DenseMatrix, saveDataPath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(saveDataPath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + val res = toRowMajorArray(mat) + sc.parallelize(res.map(_.mkString(";"))).saveAsTextFile(saveDataPath) + } + + def saveVector(vec: DenseVector, saveDataPath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(saveDataPath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + val res = vec.toArray + sc.parallelize(res).repartition(1).saveAsTextFile(saveDataPath) + } + + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val sigma1File = new Path(s"${path0}/s") + val sigma2File = new Path(s"${path1}/s") + val V1File = new Path(s"${path0}/V") + val V2File = new Path(s"${path1}/V") + if (fs.exists(sigma1File) && fs.exists(sigma2File) && fs.exists(V1File) && fs.exists(V2File)) { + val s1 = sc.textFile(s"${path0}/s").map(_.toDouble).collect() + val s2 = sc.textFile(s"${path1}/s").map(_.toDouble).collect() + val V1 = sc.textFile(s"${path0}/V").map(line => line.split(";").map(_.toDouble)).collect() + val V2 = sc.textFile(s"${path1}/V").map(line => line.split(";").map(_.toDouble)).collect() + if (isEqualVector(s1, s2) && isEqualMatrix(V1, V2)) { + return "true" + } + else { + return "false" + } + } + else{ + return "invaildComparison" + } + } + + def isEqualMatrix(res1: Array[Array[Double]], res2: Array[Array[Double]]): Boolean = { + if (res1.length != res2.length) + return false + for (i <- res2.indices) { + if (res1(i).length != res2(i).length) + return false + for (j <- res1(i).indices) { + if (math.abs(math.abs(res1(i)(j)) - math.abs(res2(i)(j))) > 1e-6) + return false + } + } + true + } + + def isEqualVector(optres: Array[Double], rawres: Array[Double]): Boolean = { + val res1 = optres.sorted + val res2 = rawres.sorted + if (res1.length != res2.length) + return false + for (i <- res2.indices) { + if ((math.abs(res1(i) - res2(i)) / res1(i)) > 0.001) + return false + } + true + } + + def toRowMajorArray(matrix: DenseMatrix): Array[Array[Double]] = { + val nRow = matrix.numRows + val nCol = matrix.numCols + val arr = new Array[Array[Double]](nRow).map(_ => new Array[Double](nCol)) + if(matrix.isTransposed){ + var srcOffset = 0 + for{i <- 0 until nRow} { + System.arraycopy(matrix.values, srcOffset, arr(i), 0, nCol) + srcOffset += nCol + } + } else { + matrix.values.indices.foreach(idx => { + val j = math.floor(idx / nRow).toInt + val i = idx % nRow + arr(i)(j) = matrix.values(idx) + }) + } + arr + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala new file mode 100644 index 0000000..ea44058 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/SimRankVerify.scala @@ -0,0 +1,61 @@ +// scalastyle:off +package com.bigdata.compare.ml + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.{SparkConf, SparkContext} + +import java.io.FileWriter + +object SimRankVerify { + val EPS = 1e-7 + + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + + val conf = new SparkConf().setAppName("SimRankVerify") + val spark = 
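// A minimal worked example for the toRowMajorArray helpers in MatrixVerify and SVDVerify above:
// a non-transposed mllib DenseMatrix stores its values column-major, so values(idx) maps to
// row = idx % numRows, col = idx / numRows.
import org.apache.spark.mllib.linalg.DenseMatrix
val m = new DenseMatrix(2, 3, Array(1.0, 4.0, 2.0, 5.0, 3.0, 6.0))  // rows (1 2 3) and (4 5 6)
// SVDVerify.toRowMajorArray(m) returns Array(Array(1.0, 2.0, 3.0), Array(4.0, 5.0, 6.0))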
SparkSession.builder().config(conf).getOrCreate() + + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + spark.stop() + } + + def saveRes(res1: DataFrame, res2: DataFrame, savePath: String, sc: SparkContext): Unit = { + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(savePath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + res1.foreach(_ => {}) + res2.foreach(_ => {}) + res1.write.mode("overwrite").option("header", value = true).csv(s"${savePath}/user") + res2.write.mode("overwrite").option("header", value = true).csv(s"${savePath}/item") + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + var isCorrect = "true" + val res0UserPath = s"${path0}/user" + val res0ItemPath = s"${path0}/item" + val res1UserPath = s"${path1}/user" + val res1ItemPath = s"${path1}/item" + val res0User = spark.read.option("header", value = true).option("inferSchema", value = true).csv(res0UserPath) + val res0Item = spark.read.option("header", value = true).option("inferSchema", value = true).csv(res0ItemPath) + val res1User = spark.read.option("header", value = true).option("inferSchema", value = true).csv(res1UserPath) + val res1Item = spark.read.option("header", value = true).option("inferSchema", value = true).csv(res1ItemPath) + val userSim = res0User.join(res1User, Seq("user1", "user2"), "full") + val itemSim = res0Item.join(res1Item, Seq("item1", "item2"), "full") + userSim.foreach(row => + {if(math.abs(row.getDouble(2) - row.getDouble(3)) >= EPS) + isCorrect = "false"}) + itemSim.foreach(row => + {if(math.abs(row.getDouble(2) - row.getDouble(3)) >= EPS) + isCorrect = "false"}) + isCorrect + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/TEVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/TEVerify.scala new file mode 100644 index 0000000..e35b9f5 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/TEVerify.scala @@ -0,0 +1,87 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.linalg.Vector + +import java.io.FileWriter + +object TEVerify { + val TOLERANCE = 1E-15 + + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("TEVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + var isCorrect = "true" + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val encodedTrainSavePath = s"${path0}/encodedTrain" + val encodedTestSavePath = s"${path0}/encodedTest" + val h2oEncodedTrainSavePath = s"${path1}/encodedTrain" + val h2oEncodedTestSavePath = s"${path1}/encodedTest" + val optTrainFile = new Path(encodedTrainSavePath) + val optTestFile = new Path(encodedTestSavePath) + val rawTrainFile = new Path(h2oEncodedTrainSavePath) + val rawTestFile = 
new Path(h2oEncodedTestSavePath) + if (!fs.exists(optTrainFile) || !fs.exists(optTestFile) || !fs.exists(rawTrainFile) || !fs.exists(rawTestFile)) { + isCorrect = "invaildComparison" + return isCorrect + } + println("Start functionality verification") + println("loading parquet1") + val trainDF1 = spark.read.parquet(encodedTrainSavePath) + val colsToTrainCompare = trainDF1.columns.filter(_.endsWith("_te")) + println("collecting encodings1") + val encoded1 = colsToTrainCompare.map(col => trainDF1.select(col).rdd.map(_.getAs[Vector](0).toArray(0)).collect().sorted) + + println("loading parquet2") + val trainDF2 = spark.read.parquet(h2oEncodedTrainSavePath) + println("collecting encodings2") + val encoded2 = colsToTrainCompare.map(col => trainDF2.select(col).rdd.map(_.getAs[Vector](0).toArray(0)).collect().sorted) + if (encoded1.length != encoded2.length) { + isCorrect = "false" + return isCorrect + } + encoded1.zip(encoded2).map { xy => + xy._1.zip(xy._2).map { ab => + if (math.abs(ab._1 - ab._2) > TOLERANCE) { + isCorrect = "false" + return isCorrect + } + } + } + println("Train results all close.") + + val testDF1 = spark.read.parquet(encodedTestSavePath) + val testDF2 = spark.read.parquet(h2oEncodedTestSavePath) + val colsToTestCompare = testDF1.columns.filter(_.endsWith("_te")) + for (col <- colsToTestCompare) { + val res1 = testDF1.select(col).rdd.map(_.getAs[Vector](0).toArray(0)).collect().sorted + val res2 = testDF2.select(col).rdd.map(_.getAs[Vector](0).toArray(0)).collect().sorted + if (res1.length != res2.length) { + isCorrect = "false" + return isCorrect + } + res1.zip(res2).map { ab => + if (math.abs(ab._1 - ab._2) > TOLERANCE) { + isCorrect = "false" + } + } + } + println("Test results all close.") + println("Functionality OK") + isCorrect + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/UpEvaluationVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/UpEvaluationVerify.scala new file mode 100644 index 0000000..1de9cb4 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/UpEvaluationVerify.scala @@ -0,0 +1,42 @@ +package com.bigdata.compare.ml + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.SparkSession + +import java.io.FileWriter + +object UpEvaluationVerify { + def main(args: Array[String]): Unit = { + val path0 = args(0) + val path1 = args(1) + val sparkConf = new SparkConf().setAppName("UpEvaluationVerify") + val spark = SparkSession.builder.config(sparkConf).getOrCreate() + val isCorrect = compareRes(path0, path1, spark) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n") + writerIsCorrect.close() + println(s"The algorithm is correct: ${isCorrect}") + + } + + def compareRes(path0: String, path1: String, spark: SparkSession): String = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val res1File = new Path(path0) + val res2File = new Path(path1) + if (!fs.exists(res1File) || !fs.exists(res2File)) { + return "invaildComparison" + } + val res1 = sc.textFile(path0).collect().head.toDouble + val res2 = sc.textFile(path1).collect().head.toDouble + if (res1 * (1 + 0.005) - res2 >= 0.0) { + return "true" + } + else { + return "false" + } + } + +} + diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/Word2VecEvaluation.scala 
b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/Word2VecEvaluation.scala new file mode 100644 index 0000000..798c23e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/Word2VecEvaluation.scala @@ -0,0 +1,131 @@ +package com.bigdata.compare.ml + +import com.github.fommil.netlib.BLAS.{getInstance => blas} +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.mllib.feature.Word2VecModel +import org.apache.spark.sql.{DataFrame, SparkSession} + +object Word2VecEvaluation extends Serializable { + + def meanVectors(vectors: Array[Array[Float]], startIdx: Int, endIdx: Int): Array[Float] = { + val vectorSize = vectors(0).length + val result = new Array[Float](vectorSize) + Range(startIdx, endIdx).foreach(i => blas.saxpy(vectorSize, 1.0f, vectors(i), 1, result, 1)) + blas.sscal(vectorSize, (1.0 / (endIdx - startIdx)).toFloat, result, 1) + result + } + + def meanVectors(vectors: Array[Array[Float]]): Array[Float] = { + meanVectors(vectors, 0, vectors.length) + } + + def cosDistance(x: Array[Double], y: Array[Double]): Double = { + var sim, normX, normY = 0.0 + for (i <- x.indices) { + sim += x(i) * y(i) + normX += math.pow(x(i), 2) + normY += math.pow(y(i), 2) + } + sim / math.sqrt(normX * normY) + } + + def evaluateTaobaoProductCTR( + spark: SparkSession, + downstreamTrainFile: String, + downstreamTestFile: String, + w2vModel: Word2VecModel, + pt: Int): Double = { + val sc = spark.sparkContext + val bcW2vModel = sc.broadcast(w2vModel.getVectors) + + // 构造特征: [u, v, |u - v|] + import spark.implicits._ + val trainData = sc.objectFile[(Double, Array[String])](downstreamTrainFile, pt) + .map { case (label, sentence) => + val localW2vModel = bcW2vModel.value + val featureVectors = sentence.map(localW2vModel) + val feature = meanVectors(featureVectors, 0, featureVectors.length - 1).map(_.toDouble) + val target = featureVectors.last.map(_.toDouble) + val distance = cosDistance(feature, target) + (label, Vectors.dense(feature ++ target ++ Array(distance))) + }.repartition(pt) + .toDF("label", "features") + .cache() + val testData = sc.objectFile[(Double, Array[String])](downstreamTestFile, pt) + .map { case (label, sentence) => + val featureVectors = sentence.map(bcW2vModel.value) + val feature = meanVectors(featureVectors, 0, featureVectors.length - 1).map(_.toDouble) + val target = featureVectors.last.map(_.toDouble) + val distance = cosDistance(feature, target) + (label, Vectors.dense(feature ++ target ++ Array(distance))) + }.repartition(pt) + .toDF("label", "features") + .cache() + + // 训练二分类模型 + val model = new LogisticRegression().fit(trainData) + trainData.unpersist() + + // 评测:ROC + val predictionAndLables = model.transform(testData) + .select("prediction", "label").rdd + .map(row => (row.getAs[Double](0), row.getAs[Double](1))) + new BinaryClassificationMetrics(predictionAndLables).areaUnderROC() + } + + def evaluateAlibabaCTR( + spark: SparkSession, + fieldNames: Array[String], + downstreamTrainFile: String, + downstreamTestFile: String, + w2vModels: Array[Word2VecModel], + pt: Int): Double = { + val sc = spark.sparkContext + val bcW2vModels = w2vModels.map(m => sc.broadcast(m.getVectors)) + val fieldNameIdMap = Map("item" -> 0, "cate" -> 1, "shop" -> 2, "node" -> 3, "product" -> 4, "brand" -> 5) + val bcFieldIds = sc.broadcast(fieldNames.map(fieldNameIdMap)) + + // 构造特征: [u, v, 
|u - v|] + val trainData = readAlibabaDownstreamData(spark, bcFieldIds, downstreamTrainFile, bcW2vModels, pt).cache() + val testData = readAlibabaDownstreamData(spark, bcFieldIds, downstreamTestFile, bcW2vModels, pt).cache() + + // 训练二分类模型 + val model = new LogisticRegression().fit(trainData) + trainData.unpersist() + + // 评测:ROC + val predictionAndLabels = model.transform(testData) + .select("prediction", "label").rdd + .map(row => (row.getAs[Double](0), row.getAs[Double](1))) + new BinaryClassificationMetrics(predictionAndLabels).areaUnderROC() + } + + def readAlibabaDownstreamData(spark: SparkSession, bcFieldIds: Broadcast[Array[Int]], downstreamFile: String, + bcW2vModels: Array[Broadcast[Map[String, Array[Float]]]], pt: Int): DataFrame = { + val sc = spark.sparkContext + val vectorSize = bcW2vModels(0).value.valuesIterator.next().length + + import spark.implicits._ + sc.objectFile[(Double, Array[(String, Array[String])])](downstreamFile, pt) + .map { case (label, fieldVectors) => + (label, bcFieldIds.value.map(id => fieldVectors(id))) + }.map { case (label, fieldVectors) => + val fieldNum = fieldVectors.length + val features = new Array[Double](fieldNum * (2 * vectorSize + 1)) + var offset = 0 + for (i <- 0 until fieldNum) { + val adVector = bcW2vModels(i).value(fieldVectors(i)._1).map(_.toDouble) + val historyVector = meanVectors(fieldVectors(i)._2.map(bcW2vModels(i).value)).map(_.toDouble) + System.arraycopy(adVector, 0, features, offset, vectorSize) + System.arraycopy(historyVector, 0, features, offset + vectorSize, vectorSize) + features(offset + 2 * vectorSize) = cosDistance(adVector, historyVector) + offset += 2 * vectorSize + 1 + } + (label, Vectors.dense(features)) + }.repartition(pt) + .toDF("label", "features") + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/BFSRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/BFSRunner.scala new file mode 100644 index 0000000..119c2a3 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/BFSRunner.scala @@ -0,0 +1,119 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.BFS +import org.apache.spark.rdd.RDD +import org.apache.spark.{SparkConf, SparkContext} + +class BFSConfig extends Serializable { + @BeanProperty var bfs: util.HashMap[String, Object] = _ +} + +class BFSParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isDirect: Boolean = _ + @BeanProperty var sourceID: Long = _ + @BeanProperty var depthLimit: Int = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object BFSRunner { + def main(args: Array[String]): Unit = { + try { + val datasetName: String = args(0) + val sourceID: Long = args(1).toLong + val numPartitions: Int = args(2).toInt + val isRaw: String = args(3) + val inputPath: String = args(4) + val outputPath: String = args(5) + + val stream: 
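// A single-example sketch of the downstream feature layout built in
// Word2VecEvaluation.evaluateTaobaoProductCTR above (illustrative; it reuses the meanVectors and
// cosDistance helpers of that object): the history embedding is the mean of all but the last token
// vector, the target is the last token vector, and both are concatenated together with their
// cosDistance score before logistic regression.
def buildFeature(sentenceVecs: Array[Array[Float]]): org.apache.spark.ml.linalg.Vector = {
  val history = Word2VecEvaluation.meanVectors(sentenceVecs, 0, sentenceVecs.length - 1).map(_.toDouble)
  val target = sentenceVecs.last.map(_.toDouble)
  org.apache.spark.ml.linalg.Vectors.dense(history ++ target ++ Array(Word2VecEvaluation.cosDistance(history, target)))
}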
InputStreamReader = Utils.getStream("conf/graph/bfs/bfs.yml") + val representer = new Representer + representer.addClassTag(classOf[BFSParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[BFSConfig]), representer, options) + val description = new TypeDescription(classOf[BFSParams]) + yaml.addTypeDescription(description) + val config: BFSConfig = yaml.load(stream).asInstanceOf[BFSConfig] + val paramsMap: util.HashMap[String, Object] = + config.bfs + .get(datasetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new BFSParams() + + val splitGraph: String = paramsMap.get("splitGraph").toString + val isDirect: Boolean = paramsMap.get("isDirect").toString.toBoolean + val depthLimit: Int = paramsMap.get("depthLimit").toString.toInt + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setDatasetName(datasetName) + params.setNumPartitions(numPartitions) + params.setIsDirect(isDirect) + params.setSourceID(sourceID) + params.setDepthLimit(depthLimit) + params.setIsRaw(isRaw) + params.setAlgorithmName("BFS") + params.setTestcaseType(s"BFS_${datasetName}_${sourceID}") + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + val appName = s"BFS_${datasetName}_${sourceID}" + val sparkConf: SparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime: Long = System.currentTimeMillis() + val inputRdd: RDD[(String, String)] = + Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, numPartitions) + val graph: Graph[Int, Int] = Util.buildUnweightedGraph(inputRdd, isDirect) + + val result: RDD[String] = + BFS + .run(graph, sourceID, isDirect, depthLimit) + .vertices + .filter(_._2._1 != Integer.MAX_VALUE) + .map(f => f._1.toString + ";" + f._2._2.mkString(",") + ";" + f._2._1) + + Util.saveDataToHDFS(result, outputPath) + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/BFS_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/BetweennessRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/BetweennessRunner.scala new file mode 100644 index 0000000..dc043bb --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/BetweennessRunner.scala @@ -0,0 +1,110 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.compare.graph.BetweennessClosenessVerify +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.Betweenness +import org.apache.spark.{SparkConf, SparkContext} + +class BetweennessConfig extends Serializable{ + @BeanProperty var betweenness: util.HashMap[String, Object] = _ +} + +class BetweennessParams extends Serializable{ + @BeanProperty var inputPath: String = _ + @BeanProperty var 
outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var computePartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var k: Int = _ + @BeanProperty var p: Float = _ + @BeanProperty var computeTopK = "no" + @BeanProperty var groundTruthPath: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var accuracy: Double = _ +} + +object BetweennessRunner { + + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val isRaw = args(1) + val partition = args(2).toInt + val inputPath = args(3) + val check = args(4) + val outputPath = args(5) + val groundTruthPath = args(6) + + val representer = new Representer + representer.addClassTag(classOf[BetweennessParams], Tag.MAP) + + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/betweenness/betweenness.yml") + val yaml = new Yaml(new Constructor(classOf[BetweennessConfig]), representer, options) + val description = new TypeDescription(classOf[BetweennessParams]) + yaml.addTypeDescription(description) + val config: BetweennessConfig = yaml.load(stream).asInstanceOf[BetweennessConfig] + val paramsMap: util.HashMap[String, Object] = + config.betweenness + .get(datasetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new BetweennessParams() + + params.setDatasetName(datasetName) + params.setInputPath(inputPath) + params.setK(paramsMap.get("k").toString.toInt) + params.setP(paramsMap.get("p").toString.toFloat) + params.setComputeTopK(check) + params.setIsRaw(isRaw) + params.setComputePartitions(partition) + params.setOutputPath(outputPath) + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setGroundTruthPath(groundTruthPath) + params.setAlgorithmName("Betweenness") + params.setTestcaseType(s"Betweenness_${datasetName}") + val conf = new SparkConf().setAppName(s"Betweenness_${datasetName}") + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val edgeRDD = Util.readEdgeListFromHDFS(sc, inputPath, params.getSplitGraph, isWeighted = false, partition) + + val result = Betweenness.run(edgeRDD, params.getK, params.getP) + Util.saveDataToHDFS(result, ",", outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + params.setCostTime(costTime) + println(s"Betweenness_${datasetName} Exec Successful: costTime: ${costTime}") + if (check.equals("yes")) { + val acc = BetweennessClosenessVerify.main(Array(groundTruthPath, outputPath, partition.toString)) + params.setAccuracy(acc) + } + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/Betweenness_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessHiveRunner.scala new file mode 100644 index 0000000..22f8542 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessHiveRunner.scala @@ -0,0 +1,85 @@ +package com.bigdata.graph + +import org.apache.spark.graphx.lib.Closeness +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{DoubleType, LongType, 
StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf + +object ClosenessHiveRunner { + def main(args: Array[String]): Unit = { + if (args.length < 10) { + println(args.mkString(",")) + println("Usage:KCoreDecompositionRunner
<tableName> <col1> <col2> <colWeight> <weighted> <k> <p> <partition> <saveMode> <saveArg>
") + System.exit(-1) + } + val tableName: String = args(0) + val col1: String = args(1) + val col2: String = args(2) + val colWeight: String = args(3) + val weighted: Boolean = args(4).toBoolean + val k: Int = args(5).toInt + val p: Double = args(6).toDouble + val partition: Int = args(7).toInt + val saveMode: String = args(8) + val saveArg: String = args(9) + + try { + val appName = s"Closeness_${tableName}_${weighted}" + val sparkConf: SparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + // record start time + val startTime: Long = System.currentTimeMillis() + + val sql = s"select * from ${tableName}" + var edgesRDD: RDD[(Long, Long, Double)] = null + var verticesRDD: RDD[(String, Long)] = null + if (colWeight == "none") { + val edges: DataFrame = spark.sql(sql).select(col1, col2) + val tmpRDD1: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + verticesRDD = tmpRDD1.flatMap(f => Iterator(f._1, f._2)).distinct.zipWithIndex.cache + verticesRDD.foreachPartition(_ => {}) + println("vertices count:" +verticesRDD.count()) + edgesRDD = tmpRDD1.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get)).leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get, 1.0)) + } else { + val edges: DataFrame = spark.sql(sql).select(col1, col2, colWeight) + val tmpRDD: RDD[(String, (String, Double))] = edges.rdd.map(row => (row(0).toString, (row(1).toString, row(2).toString.toDouble))) + verticesRDD = tmpRDD.flatMap(f => Iterator(f._1, f._2._1)).distinct().zipWithIndex().cache() + verticesRDD.foreachPartition(_ => {}) + println("vertices count:" + verticesRDD.count()) + edgesRDD = tmpRDD.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1._1, (f._2._2.get, f._2._1._2))).leftOuterJoin(verticesRDD, partition).map(f => (f._2._1._1, f._2._2.get, f._2._1._2)) + } + + val result: RDD[(Long, Double)] = + Closeness.run(edgesRDD, weighted, k, p) + val finalResult = result.leftOuterJoin(verticesRDD.map(_.swap), 200).map(f => Row(f._2._2.get, f._2._1)) + + val _ = saveMode match { + case "hive" => { + val schema_resultMap: StructType = StructType(List(StructField("_node_id", StringType, true), StructField("_closeness", DoubleType, true))) + val resultMapDF: DataFrame = spark.createDataFrame(finalResult, schema_resultMap) + resultMapDF.createOrReplaceTempView("ClosenessMapTmpV") + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_closeness(_node_id varchar(250), _closeness double)" + spark.sql(createSql) + spark.sql(s"insert into ${outputTableName}_closeness select * from ClosenessMapTmpV") + } + case "hdfs" => { + val outputPath: String = saveArg + Util.saveDataToHDFS(result, ",", outputPath) + } + case _ => throw new Exception("illegal save mode") + } + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful: costTime: ${costTime}s") + spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessRunner.scala new file mode 100644 index 0000000..099e0c4 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ClosenessRunner.scala @@ -0,0 +1,122 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import 
scala.beans.BeanProperty + +import com.bigdata.compare.graph.BetweennessClosenessVerify +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.Closeness +import org.apache.spark.{SparkConf, SparkContext} + +class ClosenessConfig extends Serializable { + @BeanProperty var closeness: util.HashMap[String, Object] = _ +} + +class ClosenessParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var weighted: String = _ + @BeanProperty var outputNodeNum: Int = _ + @BeanProperty var ratio: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var accuracy: Double = _ +} + +object ClosenessRunner { + + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val numPartitions = args(1).toInt + val weighted = args(2) + val outputNodeNum = args(3).toInt + val ratio = args(4).toDouble + val isRaw = args(5) + val inputPath = args(6) + val outputPath = args(7) + val split = args(8) + val groundTruthPath = args(9) + val check = args(10) + + val params = new ClosenessParams() + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(split) + params.setDatasetName(datasetName) + params.setNumPartitions(numPartitions) + params.setWeighted(weighted) + params.setOutputNodeNum(outputNodeNum) + params.setRatio(ratio) + params.setIsRaw(isRaw) + params.setAlgorithmName("Closenness") + params.setTestcaseType(s"Closenness_${datasetName}_${weighted}") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + val weightedBool = weighted match { + case "weighted" => true + case "unweighted" => false + case _ => throw new Exception("illegal weighted value") + } + + val appName = s"Closeness_${datasetName}_${weighted}" + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime = System.currentTimeMillis() + + val edgeRDD = + Util.readEdgeListFromHDFS(sc, + inputPath, + split, + weightedBool, + numPartitions) + val result = + Closeness.run(edgeRDD, weightedBool, outputNodeNum, ratio) + Util.saveDataToHDFS(result, ",", outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/Closeness_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + + val representer = new Representer + representer.addClassTag(classOf[ClosenessParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[ClosenessConfig]), representer, options) + val description = new TypeDescription(classOf[ClosenessParams]) + yaml.addTypeDescription(description) + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + if (check.equals("yes")) { + val acc = BetweennessClosenessVerify.main(Array(groundTruthPath, outputPath, numPartitions.toString)) + 
params.setAccuracy(acc) + } + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala new file mode 100644 index 0000000..898b239 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ClusteringCoefficientRunner.scala @@ -0,0 +1,140 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.ClusteringCoefficient +import org.apache.spark.{SparkConf, SparkContext} +class clusteringCoefficientConfig extends Serializable { + @BeanProperty var clusteringCoefficient: util.HashMap[String, util.HashMap[String, Object]] = _ +} +class clusteringCoefficientParms extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var isDirect: Boolean = _ + @BeanProperty var isWeight: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var computePartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var LocalClusteringCoefficient: AnyVal = _ + @BeanProperty var AverageClusteringCoefficient: AnyVal = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} +object ClusteringCoefficientRunner { + + def main(args: Array[String]): Unit = { + try { + val datsetName = args(0) + val computePartitions = args(1).toInt + val isWeight = args(2) + val isRaw = args(3) + val inputPath = args(4) + val api = args(5) + val outputPath = args(6) + + val weightedBool = isWeight match { + case "weighted" => true + case "unweighted" => false + case _ => throw new Exception("illegal weighted value") + } + + val stream: InputStreamReader = Utils.getStream("conf/graph/clusteringcoefficient/clusteringcoefficient.yml") + val representer = new Representer + representer.addClassTag(classOf[clusteringCoefficientParms], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[clusteringCoefficientConfig]), representer, options) + val description = new TypeDescription(classOf[clusteringCoefficientParms]) + yaml.addTypeDescription(description) + val config: clusteringCoefficientConfig = yaml.load(stream).asInstanceOf[clusteringCoefficientConfig] + val paramsMap: util.HashMap[String, Object] = config.clusteringCoefficient + .get(isRaw match { + case "no" => "opt" + case _ => "raw" + }) + .get(datsetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new clusteringCoefficientParms + + val splitGraph: String = paramsMap.get("splitGraph").toString + val isDirect: Boolean = paramsMap.get("isDirect").toString.toBoolean + + params.setDatasetName(datsetName) + params.setComputePartitions(computePartitions) + params.setIsWeight(isWeight) + params.setInputPath(inputPath) + params.setIsRaw(isRaw) + params.setApiName(api) + params.setOutputPath(outputPath) + 
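// Record the settings resolved from clusteringcoefficient.yml alongside the command-line arguments so they all appear in the report YAML. +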
params.setSplitGraph(splitGraph) + params.setIsDirect(isDirect) + params.setAlgorithmName("ClusteringCoefficient") + params.setTestcaseType(s"ClusteringCoefficient_${datsetName}_${api}_${isWeight}") + + println("inputPath:" + inputPath) + println("outputPath:" + outputPath) + + val appName = s"ClusteringCoefficient_${api}_${isWeight}_${datsetName}" + + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + val startTime: Long = System.currentTimeMillis() + + val inputRDD = Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, params.getComputePartitions) + .map(f => (f._1.toLong, f._2.toLong)) + + val graph = if (weightedBool) { + Graph.fromEdgeTuples(inputRDD, 0.0).mapEdges(f => 1.0) + } else { + Graph.fromEdgeTuples(inputRDD, 0.0) + } + + val result = api match { + case "lcc" => + val result = ClusteringCoefficient + .runLocalClusteringCoefficient(graph, isDirect, weightedBool).vertices + Util.saveDataToHDFS(result, ",", params.outputPath) + case "avgcc" => + val result: Double = ClusteringCoefficient + .runAverageClusteringCoefficient(graph, isDirect, weightedBool) + params.setAverageClusteringCoefficient(result) + case "globalcc" => + val result: Double = ClusteringCoefficient.runGlobalClusteringCoefficient(graph) + params.setLocalClusteringCoefficient(result) + case _ => throw new Exception("illegal api") + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful: costTime: ${costTime}") + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/ClusteringCoefficient_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ConnectedComponentsRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ConnectedComponentsRunner.scala new file mode 100644 index 0000000..82dd727 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ConnectedComponentsRunner.scala @@ -0,0 +1,100 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.ConnectedComponents +import org.apache.spark.{SparkConf, SparkContext} + +class CCParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition = new JHashMap[String, Int] + @BeanProperty var split = new JHashMap[String, String] + + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var curPartition: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object ConnectedComponentsRunner { + private val RESULT_SPLIT = "," + private val PARAM_FILEPATH = "conf/graph/cc/cc.yml" + + def main(args: Array[String]): Unit = { + if (args.length < 5) { + println(args.mkString(",")) + println("Usage:ConnectedComponentsRunner ") + System.exit(-1) + } + val dataset = args(0) + val 
inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + val cpuName = args(4) + + val representer = new Representer + representer.addClassTag(classOf[CCParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[CCParams]), representer, options) + val description = new TypeDescription(classOf[CCParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(PARAM_FILEPATH)).asInstanceOf[CCParams] + val partition = params.getPartition.get(s"${dataset}_${cpuName}_${isRaw}") + val split = params.getSplit.get(dataset) + val appName = if ("yes".equals(isRaw)) { + s"CC_RAW_${dataset}_${cpuName}" + } else { + s"CC_${dataset}_${cpuName}" + } + + try { + val sc = new SparkContext(new SparkConf().setAppName(appName)) + val startTime = System.currentTimeMillis() + + val inputRdd = Util.readUndirectDataFromHDFS(sc, inputPath, split, partition).map(x => (x._1.toLong, x._2.toLong)) + val graph = Graph.fromEdgeTuples(inputRdd, 0) + val result = ConnectedComponents.run(graph).vertices + Util.saveDataToHDFS(result, RESULT_SPLIT, outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + println(s"Exec Successful: connected components costTime: ${costTime}s") + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setDatasetName(dataset) + params.setIsRaw(isRaw) + params.setCurPartition(s"$partition") + params.setAlgorithmName("CC") + params.setTestcaseType(appName) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/CC_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/CycleDetectionWithConstrainsRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/CycleDetectionWithConstrainsRunner.scala new file mode 100644 index 0000000..3b22241 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/CycleDetectionWithConstrainsRunner.scala @@ -0,0 +1,122 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.CycleDetectionWithConstrains +import org.apache.spark.{SparkConf, SparkContext} + +class CDParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition = new JHashMap[String, Int] + @BeanProperty var split = new JHashMap[String, String] + @BeanProperty var minLoopLen: Int = _ + @BeanProperty var maxLoopLen: Int = _ + @BeanProperty var minRate: Double = _ + @BeanProperty var maxRate: Double = _ + + @BeanProperty var datasetName: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object CycleDetectionWithConstrainsRunner { + private val CD_PARAM_FILEPATH = "conf/graph/cd/cd.yml" + + def main(args: Array[String]): Unit = { + 
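// Expected positional arguments: dataset name, input path, output path, api name, isRaw flag, cpu name (parsed below). +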
if (args.length < 6) { + println(args.mkString(",")) + println("Usage:CycleDetectionWithConstrainsRunner ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val api = args(3) + val isRaw = args(4) + val cpuName = args(5) + + val representer = new Representer + representer.addClassTag(classOf[CDParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[CDParams]), representer, options) + val description = new TypeDescription(classOf[CDParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(CD_PARAM_FILEPATH)).asInstanceOf[CDParams] + val split = params.getSplit.get(dataset) + val partition = params.getPartition.get(s"${dataset}_${cpuName}_${isRaw}") + val appName = s"CD_${dataset}_${api}_${cpuName}" + try { + val sc = new SparkContext(new SparkConf().setAppName(appName)) + val startTime = System.currentTimeMillis() + + val input = dataset match { + case "usaRoad" => + sc.textFile(inputPath, partition) + .filter(_.startsWith("a")) + .flatMap{x => + val lines = x.split(split) + if (lines.length != 4) { + Iterator.empty + } else { + Iterator.single((lines(1).toLong, lines(2).toLong, lines(3).toDouble)) + } + + } + case _ => + sc.textFile(inputPath, partition) + .filter(!_.startsWith("#")) + .flatMap{x => + val lines = x.split(split) + if (lines.length != 3) { + Iterator.empty + } else { + Iterator.single((lines(0).toLong, lines(1).toLong, lines(2).toDouble)) + } + + } + } + val result = CycleDetectionWithConstrains.run(input, partition, params.minLoopLen, + params.maxLoopLen, params.minRate, params.maxRate) + + Util.saveDataToHDFS(result.map(_.mkString(",")), outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setCostTime(costTime) + params.setDatasetName(dataset) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("CD") + params.setTestcaseType(s"CD_${dataset}") + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/CD_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: cycle detection costTime: ${costTime}s") + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/DeepWalkRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/DeepWalkRunner.scala new file mode 100644 index 0000000..ad82c11 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/DeepWalkRunner.scala @@ -0,0 +1,111 @@ +// scalastyle:off + +package com.bigdata.graph + +import com.bigdata.utils.Utils +import org.apache.spark.graphx.lib.{DeepWalk, Parameters} +import org.apache.spark.{SparkConf, SparkContext} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import scala.beans.BeanProperty +import java.io.{File, FileWriter, InputStreamReader} +import java.util + +class DeepWalkConfig extends Serializable { + @BeanProperty var deepwalk: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class DeepWalkParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var 
outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var partitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var walkLength: Int = _ + @BeanProperty var numWalks: Int = _ + @BeanProperty var iteration: Int = _ + @BeanProperty var dimension: Int = _ + @BeanProperty var windowSize: Int = _ + @BeanProperty var negativeSample: Int = _ +} + +object DeepWalkRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName) = (modelConfSplit(0), modelConfSplit(1)) + val inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + + val representer = new Representer + representer.addClassTag(classOf[DeepWalkParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/deepwalk/deepwalk.yml") + val yaml = new Yaml(new Constructor(classOf[DeepWalkConfig]), representer, options) + val description = new TypeDescription(classOf[DeepWalkParams]) + yaml.addTypeDescription(description) + val config: DeepWalkConfig = yaml.load(stream).asInstanceOf[DeepWalkConfig] + + val paramsMap = + config.deepwalk.get(datasetName).get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).asInstanceOf[util.HashMap[String, Object]] + + val params = new DeepWalkParams() + + params.setDatasetName(datasetName) + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setWalkLength(paramsMap.get("walkLength").toString.toInt) + params.setNumWalks(paramsMap.get("numWalks").toString.toInt) + params.setNegativeSample(paramsMap.get("negativeSample").toString.toInt) + params.setIteration(paramsMap.get("iteration").toString.toInt) + params.setDimension(paramsMap.get("dimension").toString.toInt) + params.setWindowSize(paramsMap.get("windowSize").toString.toInt) + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setAlgorithmName("DeepWalk") + params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") + + val conf = new SparkConf().setAppName(params.testcaseType) + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val edgeRDD = Util.readCommFromHDFS(sc, inputPath, params.getSplitGraph, params.getPartitions) + + val deepwalkParams = Parameters(params.getWalkLength, params.getNumWalks, params.getIteration, params.getDimension, params.getWindowSize, params.getNegativeSample) + + val deepwalkModel = DeepWalk.run(edgeRDD,deepwalkParams) + Util.saveNode2VecModel(deepwalkModel, params.getOutputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + params.setCostTime(costTime) + println(s"Exec Successful: costTime: ${costTime}") + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter( + s"report/deepWalk_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/DegreeRunner.scala 
b/tools/kal-test/src/main/scala/com/bigdata/graph/DegreeRunner.scala new file mode 100644 index 0000000..061fb02 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/DegreeRunner.scala @@ -0,0 +1,112 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.{SparkConf, SparkContext} + +class DegreeConfig extends Serializable { + @BeanProperty var degree: util.HashMap[String, Object] = _ +} + +class DegreeParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var apiName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + +} + +object DegreeRunner { + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val api = args(1) + val numPartitions = args(2).toInt + val isRaw = args(3) + val inputPath = args(4) + val outputPath = args(5) + + val stream = Utils.getStream("conf/graph/degree/degree.yml") + val representer = new Representer + representer.addClassTag(classOf[DegreeParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[DegreeConfig]), representer, options) + val description = new TypeDescription(classOf[DegreeParams]) + yaml.addTypeDescription(description) + val config: DegreeConfig = yaml.load(stream).asInstanceOf[DegreeConfig] + val paramsMap = + config.degree + .get(datasetName) + .asInstanceOf[util.HashMap[String, Object]] + val splitGraph = paramsMap.get("splitGraph").toString + + val params = new DegreeParams() + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setDatasetName(datasetName) + params.setNumPartitions(numPartitions) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("Degree") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + var appName = s"Degree_${datasetName}_${api}" + if (isRaw == "yes") { + appName = s"Degree_${datasetName}_${api}_raw" + } + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + params.setTestcaseType(appName) + + // record start time + val startTime = System.currentTimeMillis() + val edgeInfo = + Util.readDataFromHDFSForDegree(sc, inputPath, splitGraph, numPartitions) + val graph = Graph.fromEdgeTuples(edgeInfo, 0) + val result = api match { + case "degrees" => graph.degrees + case "inDegrees" => graph.inDegrees + case "outDegrees" => graph.outDegrees + case _ => throw new Exception("illegal api") + } + + Util.saveDataToHDFS(result, ",", outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/Degree_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + 
println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala new file mode 100644 index 0000000..938628f --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/FraudarRunner.scala @@ -0,0 +1,324 @@ +// scalastyle:off + +package com.bigdata.graph +import com.bigdata.utils.Utils + +import org.apache.spark.graphx.lib.{Fraudar, Parameters} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.storage.StorageLevel +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{BufferedWriter, File, FileWriter, InputStreamReader} +import java.util +import scala.beans.BeanProperty +import scala.collection.mutable +import scala.collection.mutable.Map + +class FraudarConfig extends Serializable { + @BeanProperty var fraudar: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class FraudarParams extends Serializable { + @BeanProperty var splitGraph: String = _ + @BeanProperty var partitions: Int = _ + + @BeanProperty var iSetOutPath: String = _ + @BeanProperty var jSetOutPath: String = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + + +object FraudarRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName, isRaw) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val iSetOutPath = args(2) + val jSetOutPath = args(3) + val representer = new Representer + representer.addClassTag(classOf[FraudarParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/fraudar/fraudar.yml") + val yaml = new Yaml(new Constructor(classOf[FraudarConfig]), representer, options) + val description = new TypeDescription(classOf[FraudarParams]) + yaml.addTypeDescription(description) + val config: FraudarConfig = yaml.load(stream).asInstanceOf[FraudarConfig] + + val params = new FraudarParams() + val paramsMap = + config.fraudar.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(datasetName).asInstanceOf[util.HashMap[String, Object]] + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setDatasetName(datasetName) + params.setDataPath(dataPath) + params.setISetOutPath(iSetOutPath) + params.setJSetOutPath(jSetOutPath) + params.setIsRaw(isRaw) + params.setAlgorithmName("Fraudar") + params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") + + val conf = new SparkConf().setAppName(params.testcaseType) + val spark = SparkSession.builder.config(conf).getOrCreate() + val costTime = isRaw match { + case "no" => new FraudarKernel().runOptJob(spark, params) + case "yes" => new FraudarKernel().runRawJob(spark, params) + } + + params.setCostTime(costTime) + println(s"Exec Successful: costTime: 
${costTime}s") + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter( + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class FraudarKernel { + def runOptJob(spark: SparkSession, params: FraudarParams): Double = { + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val startTime = System.currentTimeMillis() + val bipartGraph = Util.readUndirectDataFromHDFS(sc, params.dataPath, params.splitGraph, params.partitions) + .map(f => (f._1.toLong, f._2.toLong)) + .persist(StorageLevel.MEMORY_ONLY_SER) + bipartGraph.foreachPartition(f => {}) + + val res = Fraudar.runFraudar(bipartGraph) + res.map(f => f._1).distinct().saveAsTextFile(params.iSetOutPath) + res.map(f => f._2).distinct().saveAsTextFile(params.jSetOutPath) + + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000.0 + costTime + } + + def runRawJob(spark: SparkSession, params: FraudarParams): Double = { + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val startTime = System.currentTimeMillis() + val bipartGraph = Util.readUndirectDataFromHDFS(sc, params.dataPath, params.splitGraph, params.partitions) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + bipartGraph.foreachPartition(f => {}) + + val res = runFraudar(sc, bipartGraph) + outputResult(params.iSetOutPath, params.jSetOutPath, res.toSet) + + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000.0 + costTime + } + + /** + * Open-source reference implementation: + * https://github.com/XinyaZhao/Social-Network-Fraud-Detection/blob/master/Analytics%20Code/Code_to_Analyze/frauder_v4.scala + */ + /* get degree from a key-values pair */ + def getDegree(s: Tuple2[Long, Iterable[(Long, Long)]]): List[String] = { + var myList = List[String]() + for (e <- s._2) { + val crt = "i" + e._2 + " j" + s._1 + " " + s._2.size // i: dst vertex, j: src vertex, third field: out-degree of the src vertex + myList = crt :: myList + } + myList.reverse + } + + /* cost function for column-weighting */ + def getCost(s: String): Double = { + val degree = s.toDouble + 1 / (math.log(degree + 5)) + } + + def flat_degree(s: String): List[(String, Double)] = { + var l = List[(String, Double)]() + val i = s.split(" ")(0) + val j = s.split(" ")(1) + val cij = s.split(" ")(2) + l = (i, getCost(cij)) :: l + l = (j, getCost(cij)) :: l + l.reverse + } + + /* aggregate per-vertex costs from an RDD whose lines store "i, j, degree of ij" */ + def getCostMap(rdd: RDD[String]): RDD[(String, Double)] = { + rdd.flatMap(flat_degree).reduceByKey((sum, n) => (sum + n)) + } + + /* Calculate the f value of the whole set */ + def getSetValue(c: RDD[(String, Double)]): Double = { + if (c.count != 0) { + val v = c.reduce((a, b) => (" ", (a._2 + b._2)))._2 + v / 2 + } + else { + 0.00 + } + } + + /* get the vertex with minimum cost */ + def getDeleted(c: RDD[(String, Double)]): String = { + if (c.count != 0) { + val deleted = c.min()(new Ordering[Tuple2[String, Double]]() { + override def compare(x: (String, Double), y: (String, Double)): Int = + Ordering[Double].compare(x._2, y._2) + })._1 + //println("deleted:------------------------- " + deleted) + deleted + } + else { + " " + } + } + + /* update each line with a deleted vertex */ + def update(sc:SparkContext, degree: RDD[String], d: String): RDD[String] = { + var new_array = 
degree.collect; + var tmp = new_array.to[mutable.ArrayBuffer] + if (d.contains("j")) { + new_array = new_array.filterNot(s => s.split(" ")(1) == d) + tmp = new_array.to[mutable.ArrayBuffer] + } + if (d.contains("i")) { + var update_j = List[String]() + for (s <- new_array) { + if (s.split(" ")(0) == d) { + update_j = s.split(" ")(1) :: update_j + tmp -= s + } + } + + val tmp_buffer = tmp.toArray + // need a tmp buffert to deletee tmp + for (j <- update_j) { + for (s <- tmp_buffer) { + if (s.split(" ")(1) == j) { + tmp -= s + val iszero = s.split(" ")(2).toInt - 1 + if (iszero != 0) { + val new_line = s.split(" ")(0) + " " + s.split(" ")(1) + " " + (s.split(" ")(2).toInt - 1).toString + tmp -= s + tmp += new_line + } + } + } + } + } + sc.parallelize(tmp) + } + + /* get graph from cost array*/ + def getGraph(degree: RDD[String]): List[String] = { + var g = List[String]() + for (c <- degree.collect) { + if (!g.contains(c.split(" ")(0))) { + g = c.split(" ")(0) :: g + } + if (!g.contains(c.split(" ")(1))) { + g = c.split(" ")(1) :: g + } + } + g + } + + /* iterative delete a vertex */ + def greedyDecreasing(sc:SparkContext, degree: RDD[String]): List[String] = { + val cost = getCostMap(degree) + val value = getSetValue(cost) / cost.count + var valueMap = Map[Int, Double]() + valueMap += (0 -> value) + var graph = List[List[String]]() + graph = getGraph(degree) :: graph + var new_degree = degree + var c = cost + var a = 0 + while (c.count != 0) { // not cost size, need to be the number of vertex + val iter1 = System.currentTimeMillis() + println("c.count : " + c.count) + a = a + 1 + val d = getDeleted(c) + new_degree = update(sc, new_degree, d) + //newDegree = update(deleted) //update the degree of remaining i and j based on the deteled vertex + c = getCostMap(new_degree) + graph = getGraph(new_degree) :: graph + val value = getSetValue(c) / c.count // the set vaule should be divided by the |C| + //println("value : " + value) + //new_degree.foreach(println) + //println(getGraph(c)) + valueMap += (a -> value) + + val iter2 = System.currentTimeMillis() + println(s"iterNum:${a}, updatetime: "+ ((iter2 - iter1) / 1000.0) + " sec") + } + var max_index = -1 + var max_Value = -1.000 + for (s <- valueMap) { + if (s._2 > max_Value) { + max_index = s._1 + max_Value = s._2 + } + } + //println("maxvalue" + " " + max_Value + " index:" + max_index) + //graph.reverse.foreach(println) + val objectGraph = graph.reverse(max_index) + + //objectGraph.foreach(f=>println(f)) + objectGraph + } + + /* get the most density graph*/ + def getFinalSet(cst: Map[String, Double], dgr: List[String]): Set[String] = { + var set = Set[String]() + for (e <- cst) { + set += (e._1) + } + set -- dgr.toSet + } + + def outputResult(iset_out:String, jset_out:String, set: Set[String]): Unit = { + val ibf = new BufferedWriter(new FileWriter(iset_out)); + val jbf = new BufferedWriter(new FileWriter(jset_out)); + val sorted_list = set.toList.sortWith(_.substring(1).toLong < _.substring(1).toLong) + for (s <- sorted_list) { + if (s.contains("i")) { + ibf.write(s + "\n"); + } + else { + jbf.write(s + "\n"); + } + } + ibf.flush() + jbf.flush() + ibf.close() + jbf.close() + } + + def runFraudar(sc:SparkContext, bipartGraph: RDD[(Long, Long)]):List[String] = { + val pairs = bipartGraph.map(x => (x._2, x._1)) + val group = pairs.groupBy(x => x._1) + val degree = group.flatMap(getDegree) + greedyDecreasing(sc, degree) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/IncConnectedComponentsRunner.scala 
b/tools/kal-test/src/main/scala/com/bigdata/graph/IncConnectedComponentsRunner.scala new file mode 100644 index 0000000..b04798e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/IncConnectedComponentsRunner.scala @@ -0,0 +1,108 @@ +// scalastyle:off + +package com.bigdata.graph + +import java.io.{File, FileWriter, InputStreamReader} +import java.util +import com.bigdata.utils.Utils +import org.apache.spark.graphx.lib.{IncConnectedComponents, Parameters} +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.storage.StorageLevel +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import scala.beans.BeanProperty + +class IncCCConfig extends Serializable { + @BeanProperty var inccc: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class IncCCParams extends Serializable { + @BeanProperty var splitGraph: String = _ + @BeanProperty var partitions: Int = _ + + @BeanProperty var orgCCPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + + +object IncConnectedComponentsRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName, isRaw) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val outputPath = args(2) + val orgCCPath = args(3) + val incGraphPath = args(4) + val representer = new Representer + representer.addClassTag(classOf[IncCCParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/inccc/inccc.yml") + val yaml = new Yaml(new Constructor(classOf[IncCCConfig]), representer, options) + val description = new TypeDescription(classOf[IncCCParams]) + yaml.addTypeDescription(description) + val config: IncCCConfig = yaml.load(stream).asInstanceOf[IncCCConfig] + + val params = new IncCCParams() + val paramsMap = + config.inccc.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(datasetName).asInstanceOf[util.HashMap[String, Object]] + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setOrgCCPath(orgCCPath) + params.setDatasetName(datasetName) + params.setDataPath(dataPath) + params.setOutputPath(outputPath) + params.setIsRaw(isRaw) + params.setAlgorithmName("IncCC") + params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") + + val conf = new SparkConf().setAppName(params.testcaseType) + val spark = SparkSession.builder.config(conf).getOrCreate() + val sc = spark.sparkContext + + val startTime = System.currentTimeMillis() + val historyCC = Util.readUndirectDataFromHDFS(sc, orgCCPath, params.splitGraph, params.partitions) + .map(f => (f._1.toLong, f._2.toLong)) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + historyCC.foreachPartition(f => {}) + val incGraph = Util.readUndirectDataFromHDFS(sc, incGraphPath, params.splitGraph, params.partitions) + .map(f => (f._1.toLong, f._2.toLong)) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + 
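// An empty foreachPartition action forces the cached edge RDDs to materialize before the incremental computation starts. +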
incGraph.foreachPartition(f => {}) + val res = IncConnectedComponents.run(incGraph, historyCC) + res.map(f => f._1 + "," + f._2).saveAsTextFile(outputPath) + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000 + + params.setCostTime(costTime) + println(s"Exec Successful: costTime: ${costTime}") + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter( + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/IncPageRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/IncPageRankRunner.scala new file mode 100644 index 0000000..00b60c0 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/IncPageRankRunner.scala @@ -0,0 +1,123 @@ + +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty +import scala.collection.mutable + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.SparkConf +import org.apache.spark.graphx.lib.IncPageRank +import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel + +class IncPrConfig extends Serializable { + @BeanProperty var incpr: util.HashMap[String, Object] = _ +} + +class IncPrParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var numIter: Int = _ + @BeanProperty var resetProb: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var partNum: Int = _ +} + +object IncPageRankRunner { + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + + val stream = Utils.getStream("conf/graph/incpr/incpr.yml") + + val representer = new Representer + representer.addClassTag(classOf[IncPrParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[IncPrConfig]), representer, options) + val description = new TypeDescription(classOf[IncPrParams]) + yaml.addTypeDescription(description) + val config: IncPrConfig = yaml.load(stream).asInstanceOf[IncPrConfig] + val paramsMap = + config.incpr.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + + val params = new IncPrParams() + + // val inputPath = paramsMap.get("inputPath").toString + // val outputPath = paramsMap.get("outputPath").toString + val numIter = paramsMap.get("numIter").toString.toInt + val resetProb = paramsMap.get("resetProb").toString.toDouble + val partNum = paramsMap.get("partNum").toString.toInt + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setNumIter(numIter) + params.setResetProb(resetProb) + params.setDatasetName(datasetName) + 
params.setIsRaw(isRaw) + params.setAlgorithmName("IncPr") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + val appName = s"IncPageRank_${datasetName}" + params.setTestcaseType(appName) + + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + + // record start time + val startTime = System.currentTimeMillis() + val spark = SparkSession.builder().config(sparkConf).getOrCreate() + implicit val graph = spark.read + .orc(inputPath) + .rdd + .map(row => (row.getAs[Long]("srcId"), + row.getAs[Int]("srcStatus"), + row.getAs[mutable.WrappedArray[Long]]("dstId").toArray[Long], + row.getAs[mutable.WrappedArray[Int]]("dstStatus").toArray[Int], + row.getAs[Double]("pr"))) + .persist(StorageLevel.MEMORY_ONLY_SER) + graph.foreachPartition(f => {}) + + val res = IncPageRank.run(graph, partNum, numIter, resetProb) + res.map(f => f._1 + "\t" + f._2.formatted("%.6f")).saveAsTextFile(outputPath) + + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000 + println(s"IncPageRank Computing Finished. CostTime = $costTime's.") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/IncPR_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionHiveRunner.scala new file mode 100644 index 0000000..3598e53 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionHiveRunner.scala @@ -0,0 +1,72 @@ +package com.bigdata.graph + +import org.apache.spark.graphx.lib.KCoreDecomposition +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{IntegerType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf + +object KCoreDecompositionHiveRunner { + private val RESULT_SPLIT = "," + + def main(args: Array[String]): Unit = { + if (args.length < 6) { + println(args.mkString(",")) + println("Usage:KCoreDecompositionRunner

") + System.exit(-1) + } + val tableName: String = args(0) + val col1: String = args(1) + val col2: String = args(2) + val partition: Int = args(3).toInt + val saveMode: String = args(4) + val saveArg: String = args(5) + + val appName = s"KCORE_${tableName}" + + try { + val sparkConf: SparkConf = new SparkConf() + .setAppName(appName) + .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") + val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + + val startTime: Long = System.currentTimeMillis() + val sql = s"select * from ${tableName}" + val edges: DataFrame = spark.sql(sql).select(col1, col2) + // string to long + val tmpRDD: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + val verticesRDD: RDD[(String, Long)] = tmpRDD.flatMap(f => Iterator(f._1, f._2)).distinct.zipWithIndex.cache + verticesRDD.foreachPartition(_ => {}) + println("vertices count:" + verticesRDD.count()) + val indexEdgeRDD: RDD[(Long, Long)] = tmpRDD.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get)).leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get)) + + val result: RDD[(Long, Int)] = KCoreDecomposition.run(indexEdgeRDD) + println("result count: " + result.count()) + val finalResult: RDD[Row] = result.leftOuterJoin(verticesRDD.map(_.swap), partition).map(f => Row(f._2._2.get, f._2._1)) + val _ = saveMode match { + case "hive" => { + val schema_resultMap: StructType = StructType(List(StructField("_node_id", StringType, true), StructField("_coreness", IntegerType, true))) + val resultMapDF: DataFrame = spark.createDataFrame(finalResult, schema_resultMap) + resultMapDF.createOrReplaceTempView("coreMapTmpV") + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_kcore(_node_id varchar(250), _coreness int)" + spark.sql(createSql) + spark.sql(s"insert into ${outputTableName}_kcore select * from coreMapTmpV") + } + case "hdfs" => { + val outputPath: String = saveArg + Util.saveDataToHDFS(result, RESULT_SPLIT, outputPath) + } + case _ => throw new Exception("illegal save mode") + } + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful: KCore Decomposition costTime: ${costTime}s") + spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionRunner.scala new file mode 100644 index 0000000..3d742a2 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/KCoreDecompositionRunner.scala @@ -0,0 +1,97 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.KCoreDecomposition +import org.apache.spark.{SparkConf, SparkContext} + +class KCoreParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition = new JHashMap[String, Int] + @BeanProperty var split = new JHashMap[String, String] + @BeanProperty var iterNum = new JHashMap[String, Int] + + 
@BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var curPartition: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + +} + +object KCoreDecompositionRunner { + private val RESULT_SPLIT = "," + private val PARAM_FILEPATH = "conf/graph/kcore/kcore.yml" + + def main(args: Array[String]): Unit = { + if (args.length < 5) { + println(args.mkString(",")) + println("Usage:KCoreDecompositionRunner ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + val cpuName = args(4) + + val representer = new Representer + representer.addClassTag(classOf[KCoreParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[KCoreParams]), representer, options) + val description = new TypeDescription(classOf[KCoreParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(PARAM_FILEPATH)).asInstanceOf[KCoreParams] + val partition = params.getPartition.get(s"${dataset}_${cpuName}_${isRaw}") + val split = params.getSplit.get(dataset) + val appName = s"KCORE_${dataset}_${cpuName}" + + try { + val sc = new SparkContext(new SparkConf().setAppName(appName)) + val startTime = System.currentTimeMillis() + + val inputRdd = Util.readUndirectDataFromHDFS(sc, inputPath, split, partition) + .map(x => (x._1.trim.toLong, x._2.trim.toLong)) + val result = KCoreDecomposition.run(inputRdd) + Util.saveDataToHDFS(result, RESULT_SPLIT, outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + println(s"Exec Successful: KCore Decomposition costTime: ${costTime}s") + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setDatasetName(dataset) + params.setIsRaw(isRaw) + params.setCurPartition(s"$partition") + params.setAlgorithmName("KCore") + params.setTestcaseType(s"KCore_${dataset}") + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/KCORE_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/KatzCentrality.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/KatzCentrality.scala new file mode 100644 index 0000000..f444cdd --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/KatzCentrality.scala @@ -0,0 +1,107 @@ +// scalastyle:off + +package com.bigdata.graph +import com.bigdata.utils.Utils + +import org.apache.spark.graphx.lib.{KatzCentrality, Parameters} +import org.apache.spark.sql.SparkSession +import org.apache.spark.{SparkConf, SparkContext} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter, InputStreamReader} +import java.util +import scala.beans.BeanProperty + +class KatzCentralityConfig extends Serializable { + @BeanProperty var katz: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class KatzCentralityParams extends Serializable { + @BeanProperty var splitGraph: String = _ + @BeanProperty var partitions: Int = _ + 
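// splitGraph, partitions, isWeight, tol, maxIter and normalized are loaded per dataset from conf/graph/katz/katz.yml; the remaining fields are filled in at runtime. +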
@BeanProperty var isWeight: Boolean = _ + @BeanProperty var tol: Double = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var normalized: Boolean = _ + + @BeanProperty var outputPath: String = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + + +object KatzCentralityRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName, isRaw) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val outputPath = args(2) + val representer = new Representer + representer.addClassTag(classOf[KatzCentralityParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/katz/katz.yml") + val yaml = new Yaml(new Constructor(classOf[KatzCentralityConfig]), representer, options) + val description = new TypeDescription(classOf[KatzCentralityParams]) + yaml.addTypeDescription(description) + val config: KatzCentralityConfig = yaml.load(stream).asInstanceOf[KatzCentralityConfig] + + val params = new KatzCentralityParams() + val paramsMap = + config.katz.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(datasetName).asInstanceOf[util.HashMap[String, Object]] + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setIsWeight(paramsMap.get("isWeight").toString.toBoolean) + params.setTol(paramsMap.get("tol").toString.toDouble) + params.setMaxIter(paramsMap.get("maxIter").toString.toInt) + params.setNormalized(paramsMap.get("normalized").toString.toBoolean) + params.setDatasetName(datasetName) + params.setDataPath(dataPath) + params.setOutputPath(outputPath) + params.setIsRaw(isRaw) + params.setAlgorithmName("KatzCentrality") + params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}") + + val conf = new SparkConf().setAppName(params.testcaseType) + val spark = SparkSession.builder.config(conf).getOrCreate() + val sc = spark.sparkContext + + val startTime = System.currentTimeMillis() + val edgeRDD = Util.readGraphFromHDFS(sc, params.dataPath, params.splitGraph, params.isWeight, params.partitions) + .map(x => (x._1, x._2, x._3)) + val tmp = edgeRDD.map(f => Edge(f._1, f._2, f._3)) + val g: Graph[Double, Double] = Graph.fromEdges(tmp, 1.0) + val result = KatzCentrality.run(g, params.maxIter, params.tol, params.normalized) + result.map(f => (f._1, f._2)).saveAsTextFile(params.outputPath) + val finishTime = System.currentTimeMillis() + val costTime = (finishTime - startTime) / 1000 + + params.setCostTime(costTime) + println(s"Exec Successful: costTime: ${costTime}") + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter( + s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/KcoreMain.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/KcoreMain.scala new file mode 100644 index 0000000..ae2c812 --- /dev/null +++ 
b/tools/kal-test/src/main/scala/com/bigdata/graph/KcoreMain.scala @@ -0,0 +1,152 @@ +// scalastyle:off println +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty +import scala.collection.immutable.IntMap + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark._ +import org.apache.spark.graphx._ +import org.apache.spark.storage.StorageLevel + +case class Index(clic: String) + +/** + * refer to [[https://github.com/DMGroup-IUPUI/Spark-kCore/blob/be0465f11201bb099747dfa32c212609f8b2bb8b/src/main/scala/KcoreMain.scala]], + * add cycle check and change result store + */ +class KCoreRawParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition = new JHashMap[String, Int] + @BeanProperty var split = new JHashMap[String, String] + @BeanProperty var iterNum = new JHashMap[String, Int] + + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var curPartition: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object KCore { + + val initialMsg = "-10" + def mergeMsg(msg1: String, msg2: String): String = msg1 + ":" + msg2 + + def vprog(vertexId: VertexId, + value: (Int, Int, IntMap[Int], Int), + message: String): (Int, Int, IntMap[Int], Int) = { + if (message == initialMsg) { + return (value._1, value._2, value._3, value._4) + } + else { + val msg = message.split(":") + val elems = msg // newWeights.values + var counts: Array[Int] = new Array[Int](value._1 + 1) + for (m <- elems) { + val im = m.toInt + if(im <= value._1) { + counts(im) = counts(im) + 1 + } + else { + counts(value._1) = counts(value._1) + 1 + } + } + var curWeight = 0 // value._4-newWeights.size + for(i <- value._1 to 1 by -1) { + curWeight = curWeight + counts(i) + if(i <= curWeight) { + return (i, value._1, value._3, value._4) + } + } + return (0, value._1, value._3, value._4) + } + } + + def sendMsg(triplet: EdgeTriplet[(Int, Int, IntMap[Int], Int), Int]): Iterator[(VertexId, String)] = { + val sourceVertex = triplet.srcAttr + val destVertex = triplet.dstAttr + return Iterator((triplet.dstId, sourceVertex._1.toString), (triplet.srcId, destVertex._1.toString)) + } + + def main(args: Array[String]) { + val RESULT_SPLIT = "," + val PARAM_FILEPATH = "conf/graph/kcore/kcore.yml" + if (args.length < 5) { + println(args.mkString(",")) + println("Usage:KCore ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + val cpuName = args(4) + + val representer = new Representer + representer.addClassTag(classOf[KCoreRawParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[KCoreRawParams]), representer, options) + val description = new TypeDescription(classOf[KCoreRawParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(PARAM_FILEPATH)).asInstanceOf[KCoreRawParams] + val partition = params.getPartition.get(s"${dataset}_${cpuName}_${isRaw}") + val split = params.getSplit.get(dataset) + val appName = s"KCORE_RAW_${dataset}_${cpuName}" + 
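// maxIter below is the Pregel iteration limit for this dataset, looked up from the iterNum map in conf/graph/kcore/kcore.yml. +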
val maxIter = params.iterNum.get(dataset) + + // setting up spark environment + val conf: SparkConf = new SparkConf().setAppName(appName) + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + val graphData = Util.readUndirectDataFromHDFS(sc, inputPath, split, partition) + .map(x => (x._1.toLong, x._2.toLong)) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val ygraph = Util.convertToGraphXGraph(graphData) + .partitionBy(PartitionStrategy.RandomVertexCut) + .groupEdges((e1, e2) => e1) + .subgraph(epred = edge => edge.srcId != edge.dstId) + + val deg = ygraph.degrees + + val mgraph = ygraph.outerJoinVertices(deg)((id, oldattr, newattr) => newattr.getOrElse(0)) + .mapVertices((id, attr) => (attr, -1, IntMap[Int](), attr)) + ygraph.unpersist() + val minGraph = mgraph.pregel(initialMsg, maxIter, EdgeDirection.Either)(vprog, sendMsg, mergeMsg) + + minGraph.vertices.map(x => s"${x._1}${RESULT_SPLIT}${x._2._1}").saveAsTextFile(outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + println(s"Exec Successful: KCore Decomposition Raw costTime: ${costTime}s") + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setDatasetName(dataset) + params.setIsRaw(isRaw) + params.setCurPartition(s"$partition") + params.setAlgorithmName("KCoreRaw") + params.setTestcaseType(s"KCore_Raw_${dataset}") + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/KCORE_RAW_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + sc.stop() + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/LabelPropagationRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/LabelPropagationRunner.scala new file mode 100644 index 0000000..dfc1ba2 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/LabelPropagationRunner.scala @@ -0,0 +1,115 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.PartitionStrategy.EdgePartition2D +import org.apache.spark.graphx.lib.{LabelPropagation, Modularity} +import org.apache.spark.{SparkConf, SparkContext} + +class LPAParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition = new JHashMap[String, Int] + @BeanProperty var partitionNum: Int = _ + @BeanProperty var split: String = _ + @BeanProperty var maxSteps: Int = _ + + @BeanProperty var datasetName: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var modularity: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object LabelPropagationRunner { + private val RESULT_SPLIT = "," + private val PARAM_FILEPATH = "conf/graph/lpa/lpa.yml" + + def main(args: Array[String]): Unit = { + if (args.length < 6) { + println(args.mkString(",")) + println("Usage:LabelPropagationRunner ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val api = 
args(3) + val isRaw = args(4) + val cpuName = args(5) + + val representer = new Representer + representer.addClassTag(classOf[LPAParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LPAParams]), representer, options) + val description = new TypeDescription(classOf[LPAParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(PARAM_FILEPATH)).asInstanceOf[LPAParams] + val partition = params.getPartition.get(s"${dataset}_${cpuName}") + val appName = s"LPA_${dataset}_${api}_${cpuName}" + try { + val conf = new SparkConf().setAppName(appName) + if ("runConvergence".equals(api)) { + conf.set("spark.boostkit.graph.lpa.convergence", "true") + } + val sc = new SparkContext(conf) + val startTime = System.currentTimeMillis() + + val input = Util.readUndirectDataFromHDFS(sc, inputPath, params.getSplit, partition) + .flatMap(x => Iterator((x._1.toLong, x._2.toLong))) + val inputGraph = + if ("yes".equals(isRaw)) { + Graph.fromEdgeTuples(input, 0).partitionBy(EdgePartition2D) + } else { + Graph.fromEdgeTuples(input, 0) + } + val result = LabelPropagation.run(inputGraph, params.maxSteps).vertices + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + Util.saveDataToHDFS(result, RESULT_SPLIT, outputPath) + params.setCostTime(costTime) + println(s"Exec Successful: label propagation costTime: ${costTime}s") + + if ("no".equals(isRaw)) { + val nodes = Util.readCommFromHDFS(sc, outputPath, RESULT_SPLIT, partition) + val edges = Util.readGraphFromHDFS(sc, inputPath, params.getSplit, false, partition) + val modularity = Modularity.run(nodes, edges, false, partition) + params.setModularity(modularity) + println(s"Modularity: ${modularity}.") + } + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setPartitionNum(partition) + params.setDatasetName(dataset) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("LPA") + params.setTestcaseType(appName) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/LPA_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainHiveRunner.scala new file mode 100644 index 0000000..89d0f0d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainHiveRunner.scala @@ -0,0 +1,87 @@ +package com.bigdata.graph + +import org.apache.spark.graphx.lib.Louvain +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf + +object LouvainHiveRunner { + + def main(args: Array[String]): Unit = { + if (args.length < 9) { + println("Usage:LouvainRunner ") + System.exit(-1) + } + val tableName: String = args(0) + val col1: String = args(1) + val col2: String = args(2) + val colWeight: String = args(3) + val iterNum: Int = args(4).toInt + val isDirected: Boolean = args(5).toBoolean + val partition: Int = args(6).toInt + val saveMode: String = args(7) + val saveArg: String = args(8) + + try { + val appName = s"Louvain_${tableName}" + val sparkConf: SparkConf = new SparkConf() + .setAppName(appName) + 
.setMaster("yarn") + val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + // record start time + val startTime: Long = System.currentTimeMillis() + + val sql = s"select * from ${tableName}" + var edgesRDD: RDD[(Long, Long, Double)] = null + var verticesRDD: RDD[(String, Long)] = null + if (colWeight == "none") { + // the vertex ids are strings, map them to Long ids + val edges: DataFrame = spark.sql(sql).select(col1, col2) + val tmpRDD: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + verticesRDD = tmpRDD.flatMap(f => Iterator(f._1, f._2)).distinct.zipWithIndex().cache() + edgesRDD = tmpRDD.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get)).leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get, 1.0)) + } else { + // the vertex ids are strings, map them to Long ids + val edges: DataFrame = spark.sql(sql).select(col1, col2, colWeight) + val tmpRDD: RDD[(String, (String, Double))] = edges.rdd.map(row => (row(0).toString, (row(1).toString, row(2).toString.toDouble))) + verticesRDD = tmpRDD.flatMap(f => Iterator(f._1, f._2._1)).distinct().zipWithIndex().cache() + verticesRDD.foreachPartition(_ => {}) + println("vertices count:" + verticesRDD.count()) + edgesRDD = tmpRDD.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1._1, (f._2._2.get, f._2._1._2))).leftOuterJoin(verticesRDD, partition).map(f => (f._2._1._1, f._2._2.get, f._2._1._2)) + } + + val (q, comm) = Louvain.run(edgesRDD, iterNum, isDirected, partition) + // long to string + val finalResult: RDD[Row] = comm.leftOuterJoin(verticesRDD.map(_.swap), 200).map(f => Row(f._2._2.get, f._2._1)) + + val _ = saveMode match { + case "hive" => { + val schema_commMap: StructType = StructType(List(StructField("_node_id", StringType, true), StructField("_comm_id", LongType, true))) + val commMapDF: DataFrame = spark.createDataFrame(finalResult, schema_commMap) + commMapDF.createOrReplaceTempView("commMapTmpV") + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_louvain(_node_id varchar(250), _comm_id long)" + spark.sql(createSql) + spark.sql(s"insert into ${outputTableName}_louvain select * from commMapTmpV") + } + case "hdfs" => { + val outputPath: String = saveArg + Util.saveDataToHDFS(comm, ",", outputPath) + } + case _ => throw new Exception("illegal save mode") + } + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + val commNum: Int = comm.map(_._2).distinct().count().toInt + + println("louvain modularity:\t%.5f\nComm num:\t%d\n".format(q, commNum)) + println(s"Exec Successful: costTime: ${costTime}s") + spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainRunner.scala new file mode 100644 index 0000000..8868f8c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/LouvainRunner.scala @@ -0,0 +1,116 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.Louvain +import org.apache.spark.{SparkConf, SparkContext} + +class LouvainConfig extends Serializable { + @BeanProperty var
louvain: util.HashMap[String, Object] = _ +} + +class LouvainParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var maxIterations: Int = _ + @BeanProperty var isDirected: Boolean = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var modularity: Double = _ + @BeanProperty var communityNum: Int = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + +} + +object LouvainRunner { + + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val numPartitions = args(1).toInt + val isRaw = args(2) + val inputPath = args(3) + val outputPath = args(4) + + val stream = Utils.getStream("conf/graph/louvain/louvain.yml") + + val representer = new Representer + representer.addClassTag(classOf[LouvainParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LouvainConfig]), representer, options) + val description = new TypeDescription(classOf[LouvainParams]) + yaml.addTypeDescription(description) + val config: LouvainConfig = yaml.load(stream).asInstanceOf[LouvainConfig] + val paramsMap = + config.louvain + .get(datasetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new LouvainParams() + + val splitGraph = paramsMap.get("splitGraph").toString + val maxIterations = paramsMap.get("maxIterations").toString.toInt + val isDirected = paramsMap.get("isDirected").toString.toBoolean + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setMaxIterations(maxIterations) + params.setIsDirected(isDirected) + params.setDatasetName(datasetName) + params.setNumPartitions(numPartitions) + params.setIsRaw(isRaw) + params.setAlgorithmName("Louvain") + params.setTestcaseType(s"Louvain_${datasetName}") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + var appName = s"Louvain_${datasetName}" + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime = System.currentTimeMillis() + + val edgeRDD = + Util.readGraphFromHDFS(sc, inputPath, splitGraph, false, numPartitions) + val (q, comm) = Louvain.run(edgeRDD, maxIterations, isDirected, numPartitions) + Util.saveDataToHDFS(comm, ",", outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + val commNum = comm.map(_._2).distinct().count().toInt + println("louvain modularity:\t%.5f\nComm num:\t%d\nCost time:\t%.5f".format(q, commNum, costTime)) + params.setCostTime(costTime) + params.setModularity(q) + params.setCommunityNum(commNum) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/Louvain_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/MSSPRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/MSSPRunner.scala new file mode 100644 index 0000000..37cd0cd --- /dev/null +++ 
b/tools/kal-test/src/main/scala/com/bigdata/graph/MSSPRunner.scala @@ -0,0 +1,113 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, Yaml} + +import org.apache.spark.graphx.lib.ShortestPaths +import org.apache.spark.storage.StorageLevel +import org.apache.spark.{SparkConf, SparkContext} + +class MsspConfig extends Serializable { + @BeanProperty var mssp: util.HashMap[String, Object] = _ +} + +class MsspParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var sourcePath: String = _ + @BeanProperty var computePartition: Int = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object MSSPRunner { + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val computePartition = args(1).toInt + val inputPath = args(2) + val outputPath = args(3) + val sourcePath = args(4) + val splitGraph = args(5) + val isRaw = args(6) + + val params = new MsspParams() + params.setInputPath(inputPath) + params.setSourcePath(sourcePath) + params.setComputePartition(computePartition) + params.setSplitGraph(splitGraph) + params.setOutputPath(outputPath) + params.setDatasetName(datasetName) + params.setIsRaw(isRaw) + params.setAlgorithmName("Mssp") + + println("inputPath: " + inputPath) + println("sourcePath: " + sourcePath) + println("computePartition: " + computePartition) + println("splitGraph: " + splitGraph) + println("outputPath: " + outputPath) + println("datasetName: " + datasetName) + println("isRaw: " + isRaw) + var appName = s"MSSP_${datasetName}" + if (isRaw.equals("yes")) { + appName = s"MSSP_RAW_${datasetName}" + } + + params.setTestcaseType(appName) + + val sparkConf = new SparkConf().setAppName(appName).setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime = System.currentTimeMillis() + + val edgeRDD = Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, computePartition) + .map(f => (f._1.trim.toLong, f._2.trim.toLong)).persist(StorageLevel.MEMORY_AND_DISK_SER) + + val inputGraph = Util.convertToGraphXGraph(edgeRDD).persist() + val source = Util.loadSourceVertices(sc, sourcePath) + + val res = ShortestPaths.run(inputGraph, source.collect.toSeq).vertices + res.map(f => { + var s = f._1.toString + ":" + f._2.foreach(x => s = s + "(" + x._1.toString + "," + x._2.toString + ")") + s + }).saveAsTextFile(outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("Short Path result = true, and costTime = " + costTime + "s") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/MSSP_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + val representer = new Representer + representer.addClassTag(classOf[MsspParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[MsspConfig]), representer, options) + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: 
${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationHiveRunner.scala new file mode 100644 index 0000000..6ece3d5 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationHiveRunner.scala @@ -0,0 +1,84 @@ +package com.bigdata.graph + +import org.apache.spark.SparkConf +import org.apache.spark.graphx.lib.MaximalCliqueEnumeration +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{IntegerType, StructField, StructType, StringType} + +import org.apache.spark.sql.{DataFrame, Row, SparkSession} + +object MaximalCliqueEnumerationHiveRunner { + private val MCE_OUTPUT_SPLIT = "," + + def main(args: Array[String]): Unit = { + if (args.length < 8) { + println("Usage:MaxcliqueEnumerationRunner ") + System.exit(-1) + } + val tableName = args(0) + val col1 = args(1) + val col2 = args(2) + val mink = args(3).toInt + val maxDegree: Int = args(4).toInt + val partition: Int = args(5).toInt + val saveMode = args(6) + val saveArg = args(7) + + try { + val sparkConf: SparkConf = new SparkConf() + .setAppName("mce") + .setMaster("yarn") + val spark = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + val startTime = System.currentTimeMillis() + val sql = s"select * from ${tableName}" + + val edges: DataFrame = spark.sql(sql).select(col1, col2) + val edgesRDD: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + + val result: (RDD[(Int, String)], RDD[(Int, String)]) = MaximalCliqueEnumeration.run(edgesRDD, mink, maxDegree, partition) + val _ = saveMode match { + case "hive" => { + val nodeMap: RDD[Row] = result._1.asInstanceOf[RDD[(Int, String)]].map(t => Row(t._2, t._1)) + val cliqueMap: RDD[Row] = result._2.asInstanceOf[RDD[(Int, String)]].map(t => Row(t._1, t._2)) + val schema_nodeMap: StructType = StructType(List(StructField("_id", StringType, true), StructField("_id_int", IntegerType, true))) + val schema_cliqueMap: StructType = StructType(List(StructField("_id_int", IntegerType, true), StructField("_clique_id", StringType, true))) + + val nodeMapDF: DataFrame = spark.createDataFrame(nodeMap, schema_nodeMap) + val cliqueMapDF: DataFrame = spark.createDataFrame(cliqueMap, schema_cliqueMap) + nodeMapDF.createOrReplaceTempView("nodeMapTmpV") + cliqueMapDF.createOrReplaceTempView("cliqueMapTmpV") + + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_mce_nodeMap(_id varchar(250), _id_int int)" + val createSql2 = s"create table ${outputTableName}_mce_cliqueMap(_id_int int,_clique_id varchar(20))" + spark.sql(createSql) + spark.sql(createSql2) + + spark.sql(s"insert into ${outputTableName}_mce_nodeMap select * from nodeMapTmpV") + spark.sql(s"insert into ${outputTableName}_mce_cliqueMap select * from cliqueMapTmpV") + } + + case "hdfs" => { + val outputPath = saveArg + val mapInfoPath = s"${outputPath}/${tableName}/map_info" + val cliquePath = s"${outputPath}/${tableName}/clique_info" + + Util.saveDataToHDFS(result._1, MCE_OUTPUT_SPLIT, mapInfoPath) + Util.saveDataToHDFS(result._2, MCE_OUTPUT_SPLIT, cliquePath) + + } + case _ => throw new Exception("illegal save mode") + } + + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful:maximal clique detection costTime:${costTime}s") + 
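+ // results have already been written to Hive or HDFS, so the SparkSession can be released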
spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationRunner.scala new file mode 100644 index 0000000..455e6bb --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/MaximalCliqueEnumerationRunner.scala @@ -0,0 +1,91 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.MaximalCliqueEnumeration +import org.apache.spark.sql.SparkSession + +class MaximalCliqueEnumerationParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition: JHashMap[String, Int] = new JHashMap[String, Int]() + @BeanProperty var minK: Int = _ + @BeanProperty var maxDegree: Int = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var costTime: Double = _ + @BeanProperty var split: JHashMap[String, String] = new JHashMap[String, String] + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object MaximalCliqueEnumerationRunner { + private val MCE_PARAM_FILEPATH = "conf/graph/mce/mce.yml" + private val MCE_OUTPUT_SPLIT = "," + + def main(args: Array[String]): Unit = { + if (args.length < 3) { + println("Usage:MaxcliqueEnumerationRunner ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + + val representer = new Representer + representer.addClassTag(classOf[MaximalCliqueEnumerationParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[MaximalCliqueEnumerationParams]), representer, options) + val description = new TypeDescription(classOf[MaximalCliqueEnumerationParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(MCE_PARAM_FILEPATH)).asInstanceOf[MaximalCliqueEnumerationParams] + val split = params.getSplit.get(dataset) + val partition = params.getPartition.get(dataset) + val mapInfoPath = s"${outputPath}/map_info" + val cliquePath = s"${outputPath}/clique_info" + + try { + val spark = SparkSession.builder().getOrCreate() + val startTime = System.currentTimeMillis() + + println(s"##start to run test.params:${inputPath},${split},${partition}") + val inputRdd = Util.readUndirectDataFromHDFS(spark.sparkContext, inputPath, split, partition) + println(s"##start to run.params:${params.getMinK},${params.getMaxDegree},${partition}") + val result = MaximalCliqueEnumeration.run(inputRdd, params.getMinK, params.getMaxDegree, partition) + Util.saveDataToHDFS(result._1, MCE_OUTPUT_SPLIT, mapInfoPath) + Util.saveDataToHDFS(result._2, MCE_OUTPUT_SPLIT, cliquePath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setDatasetName(dataset) + params.setCostTime(costTime) + params.setAlgorithmName("MaximalCliqueEnumeration") + params.setTestcaseType(s"MaximalCliqueEnumeration_${dataset}") + + 
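+ // dump the collected parameters and timing into a timestamped YAML file under report/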
Utils.checkDirs("report") + val writer = new FileWriter(s"report/MCE_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful:maximal clique detection costTime:${costTime}s") + spark.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/ModularityRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/ModularityRunner.scala new file mode 100644 index 0000000..4c82ad2 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/ModularityRunner.scala @@ -0,0 +1,108 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.Modularity +import org.apache.spark.{SparkConf, SparkContext} + +class ModularityConfig extends Serializable { + @BeanProperty var modularity: util.HashMap[String, Object] = _ +} + +class ModularityParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var inputCommunity: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var splitCommunity: String = _ + @BeanProperty var isWeighted: Boolean = _ + @BeanProperty var isDirected: Boolean = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var modularity: Double = _ +} + +object ModularityRunner { + + def main(args: Array[String]): Unit = { + try { + val sparkConf = new SparkConf().setAppName("Modularity").setMaster("yarn") + val sc = new SparkContext(sparkConf) + + val datasetName = args(0) + val inputPath = args(1) + val inputCommunity = args(2) + + val stream = Utils.getStream("conf/graph/modularity/modularity.yml") + + val representer = new Representer + representer.addClassTag(classOf[ModularityParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[ModularityConfig]), representer, options) + val description = new TypeDescription(classOf[ModularityParams]) + yaml.addTypeDescription(description) + val config: ModularityConfig = yaml.load(stream).asInstanceOf[ModularityConfig] + val paramsMap = config.modularity.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + + val params = new ModularityParams() + + val splitGraph = paramsMap.get("splitGraph").toString + val splitCommunity = paramsMap.get("splitCommunity").toString + val isWeighted = paramsMap.get("isWeighted").toString.toBoolean + val isDirected = paramsMap.get("isDirected").toString.toBoolean + val numPartitions = paramsMap.get("numPartitions").toString.toInt + + params.setInputPath(inputPath) + params.setInputCommunity(inputCommunity) + params.setSplitGraph(splitGraph) + params.setSplitCommunity(splitCommunity) + params.setIsDirected(isDirected) + params.setIsWeighted(isWeighted) + params.setNumPartitions(numPartitions) + params.setDatasetName(datasetName) + params.setAlgorithmName("Modularity") + 
params.setTestcaseType(s"Modularity_${datasetName}") + + println("inputPath: " + inputPath) + println("inputCommunity: " + inputCommunity) + + // record start time + val startTime = System.currentTimeMillis() + val graphRDD = Util.readGraphFromHDFS(sc, inputPath, splitGraph, isWeighted, numPartitions) + val communityRDD = Util.readCommFromHDFS(sc, inputCommunity, splitCommunity, numPartitions) + val q = Modularity.run(communityRDD, graphRDD, isDirected, numPartitions) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("modularity: %.5f\nCost time: %.5f".format(q, costTime)) + + params.setCostTime(costTime) + params.setModularity(q) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/Modularity_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/Node2VecRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/Node2VecRunner.scala new file mode 100644 index 0000000..4eed7f9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/Node2VecRunner.scala @@ -0,0 +1,130 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.compare.graph.Node2vecVerify +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.{Node2Vec, Params} +import org.apache.spark.{SparkConf, SparkContext} + +class Node2VecConfig extends Serializable{ + @BeanProperty var node2vec: util.HashMap[String, Object] = _ +} + +class Node2VecParams extends Serializable{ + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var partitions: Int = _ + @BeanProperty var directed: Boolean = _ + @BeanProperty var weighted: Boolean = _ + @BeanProperty var walkLength: Int = _ + @BeanProperty var numWalks: Int = _ + @BeanProperty var p: Double = _ + @BeanProperty var q: Double = _ + @BeanProperty var iteration: Int = _ + @BeanProperty var dimension: Int = _ + @BeanProperty var windowSize: Int = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var groundTruthPath: String = _ + @BeanProperty var accuracy: Double = _ +} + +object Node2VecRunner { + + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName) = (modelConfSplit(0), modelConfSplit(1)) + val inputPath = args(1) + val outputPath = args(2) + val groundTruthPath = args(3) + val check = args(4) + + val representer = new Representer + representer.addClassTag(classOf[Node2VecParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val stream: InputStreamReader = Utils.getStream("conf/graph/node2vec/node2vec.yml") + val yaml = new Yaml(new Constructor(classOf[Node2VecConfig]), representer, options) + val description = new 
TypeDescription(classOf[Node2VecParams]) + yaml.addTypeDescription(description) + val config: Node2VecConfig = yaml.load(stream).asInstanceOf[Node2VecConfig] + val paramsMap: util.HashMap[String, Object] = + config.node2vec + .get(datasetName) + .asInstanceOf[util.HashMap[String, Object]] + + val params = new Node2VecParams() + + params.setDatasetName(datasetName) + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setPartitions(paramsMap.get("partitions").toString.toInt) + params.setDirected(paramsMap.get("directed").toString.toBoolean) + params.setWeighted(paramsMap.get("weighted").toString.toBoolean) + params.setWalkLength(paramsMap.get("walkLength").toString.toInt) + params.setNumWalks(paramsMap.get("numWalks").toString.toInt) + params.setP(paramsMap.get("p").toString.toDouble) + params.setQ(paramsMap.get("q").toString.toDouble) + params.setIteration(paramsMap.get("iteration").toString.toInt) + params.setDimension(paramsMap.get("dimension").toString.toInt) + params.setWindowSize(paramsMap.get("windowSize").toString.toInt) + params.setSplitGraph(paramsMap.get("splitGraph").toString) + params.setAlgorithmName("Node2Vec") + params.setTestcaseType(s"Node2Vec_${datasetName}") + params.setGroundTruthPath(groundTruthPath) + + val conf = new SparkConf().setAppName(s"Node2Vec_${datasetName}_${platformName}") + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val edgeRDD = Util.readEdgeListFromHDFS(sc, inputPath, params.getSplitGraph, params.getWeighted, params.getPartitions) + + val n2vParams = Params(params.getDirected, + params.getWeighted, + params.getP, + params.getQ, + params.getWalkLength, + params.getNumWalks, + params.getIteration, + params.getDimension, + params.getWindowSize) + + val node2vecModel = Node2Vec.run(edgeRDD, n2vParams) + Util.saveNode2VecModel(node2vecModel, params.getOutputPath) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + if (check.equals("yes")) { + val acc = Node2vecVerify.main(Array(inputPath, groundTruthPath, outputPath, params.getPartitions.toString)) + params.setAccuracy(acc) + } + + params.setCostTime(costTime) + println(s"Exec Successful: costTime: ${costTime}") + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/Node2Vec_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankHiveRunner.scala new file mode 100644 index 0000000..36f7b1d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankHiveRunner.scala @@ -0,0 +1,84 @@ +package com.bigdata.graph + +import org.apache.spark.graphx.{Edge, Graph} +import org.apache.spark.graphx.lib.PageRank +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{DoubleType, LongType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf + +object PageRankHiveRunner { + + def main(args: Array[String]): Unit = { + if (args.length < 10) { + println("Usage:PageRankRunner ") + System.exit(-1) + } + val tableName: String = args(0) + val col1: String = args(1) + val col2: String = args(2) + val api: String = args(3) + val tol: Double = args(4).toDouble + val resetProb: Double = args(5).toDouble + val numIter: Int = args(6).toInt 
+ val partition: Int = args(7).toInt + val saveMode: String = args(8) + val saveArg: String = args(9) + + try { + val appName = s"PageRank_${tableName}_${api}" + val sparkConf: SparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") + val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + + // record start time + val startTime: Long = System.currentTimeMillis() + + val sql = s"select * from ${tableName}" + val edges: DataFrame = spark.sql(sql).select(col1, col2) + val edgesRDD: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + val verticesRDD: RDD[(String, Long)] = edgesRDD.flatMap(f => Iterator(f._1, f._2)).distinct.zipWithIndex().cache() + verticesRDD.foreachPartition(_ => {}) + println("vertices count:" + verticesRDD.count()) + val indexEdgesRDD: RDD[Edge[Double]] = edgesRDD.leftOuterJoin(verticesRDD, partition).map(f => (f._2._1, f._2._2.get)).leftOuterJoin(verticesRDD, partition).map(f => Edge(f._2._1, f._2._2.get, 1.0)) + + val graph: Graph[Double, Double] = Graph.fromEdges(indexEdgesRDD, 1.0) + val result: Graph[Double, Double] = api match { + case "runUntilConvergence" => + PageRank.runUntilConvergence(graph, tol, resetProb) + case "run" => PageRank.run(graph, numIter, resetProb) + case _ => throw new Exception("illegal api") + } + val resultRDD = result.vertices.map(f => (f._1, f._2)) + val finalResult: RDD[Row] = resultRDD.leftOuterJoin(verticesRDD.map(_.swap), partition).map(f => Row(f._2._2.get, f._2._1)) + + + val _ = saveMode match { + case "hive" => { + val schema_resultMap: StructType = StructType(List(StructField("_node_id", StringType, true), StructField("_pagerank", DoubleType, true))) + val resultMapDF: DataFrame = spark.createDataFrame(finalResult, schema_resultMap) + resultMapDF.createOrReplaceTempView("PageRankMapTmpV") + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_pagerank(_node_id varchar(250), _pagerank double)" + spark.sql(createSql) + spark.sql(s"insert into ${outputTableName}_pagerank select * from PageRankMapTmpV") + } + case "hdfs" => { + val outputPath: String = saveArg + result.vertices.map(f => s"${f._1}\t${f._2}").saveAsTextFile(outputPath) + } + case _ => throw new Exception("illegal save mode") + } + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"Exec Successful: costTime: ${costTime}s") + spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankRunner.scala new file mode 100644 index 0000000..74d9118 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/PageRankRunner.scala @@ -0,0 +1,130 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.PageRank +import org.apache.spark.{SparkConf, SparkContext} + +class PrConfig extends Serializable { + @BeanProperty var pr: util.HashMap[String, Object] = _ +} + +class PrParams 
extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var numIter: Int = _ + @BeanProperty var resetProb: Double = _ + @BeanProperty var tolerance: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var apiName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object PageRankRunner { + + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val api = args(1) + val numPartitions = args(2).toInt + val isRaw = args(3) + val inputPath = args(4) + val outputPath = args(5) + + val stream = Utils.getStream("conf/graph/pr/pr.yml") + + val representer = new Representer + representer.addClassTag(classOf[PrParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[PrConfig]), representer, options) + val description = new TypeDescription(classOf[PrParams]) + yaml.addTypeDescription(description) + val config: PrConfig = yaml.load(stream).asInstanceOf[PrConfig] + val paramsMap = + config.pr.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + + val params = new PrParams() + + val splitGraph = paramsMap.get("splitGraph").toString + val numIter = paramsMap.get("numIter").toString.toInt + val resetProb = paramsMap.get("resetProb").toString.toDouble + var tolerance = paramsMap.get("tolerance").toString.toDouble + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setNumIter(numIter) + params.setResetProb(resetProb) + params.setTolerance(tolerance) + params.setDatasetName(datasetName) + params.setNumPartitions(numPartitions) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("Pr") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + var appName = s"PageRank_${datasetName}_${api}" + if (isRaw == "yes") { + appName = s"PageRank_${datasetName}_${api}_raw" + } + params.setTestcaseType(appName) + + val sparkConf = new SparkConf() + .setAppName(appName) + .setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime = System.currentTimeMillis() + + implicit val context = sc + val edges = + Util + .loadLines2EdgesT(inputPath, numPartitions, 1, false, splitGraph) + .setName("OriginalEdges") + val graph = Graph.fromEdges(edges, 0.15D) + val result = api match { + case "runUntilConvergence" => + PageRank.runUntilConvergence(graph, tolerance, resetProb) + case "run" => PageRank.run(graph, numIter, resetProb) + case _ => throw new Exception("illegal api") + } + + result.vertices.map(f => s"${f._1}\t${f._2}").saveAsTextFile(outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("pagerank costTime = " + costTime + "s") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/PR_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/PersonalizedPageRankRunner.scala 
b/tools/kal-test/src/main/scala/com/bigdata/graph/PersonalizedPageRankRunner.scala new file mode 100644 index 0000000..5ff4f0b --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/PersonalizedPageRankRunner.scala @@ -0,0 +1,132 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.PageRank +import org.apache.spark.graphx.{Edge, Graph} +import org.apache.spark.{SparkConf, SparkContext} +class PPrConfig extends Serializable { + @BeanProperty var ppr: util.HashMap[String, Object] = _ +} + +class PPrParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var numIter: Int = _ + @BeanProperty var resetProb: Double = _ + @BeanProperty var tolerance: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var apiName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var sourcesPath: String = _ + @BeanProperty var source: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} +object PersonalizedPageRankRunner { + + def main(args: Array[String]): Unit = { + + try { + val datasetName = args(0) + val api = args(1) + val numPartitions = args(2).toInt + val inputPath = args(3) + val isRaw = args(4) + val source = args(5) + var outputPath = args(6) + + val stream = Utils.getStream("conf/graph/ppr/ppr.yml") + + val representer = new Representer + representer.addClassTag(classOf[PPrParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[PPrConfig]), representer, options) + val description = new TypeDescription(classOf[PPrParams]) + yaml.addTypeDescription(description) + + val config: PPrConfig = yaml.load(stream).asInstanceOf[PPrConfig] + + val paramsMap = config.ppr.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + + val params = new PPrParams() + + var appName = s"PPR_${api}_${datasetName}_${source}" + if (isRaw.equals("yes")) { + appName = s"PPR_${api}_${datasetName}_${source}_raw" + } + + val splitGraph = paramsMap.get("splitGraph").toString + val numIter = paramsMap.get("numIter").toString.toInt + val resetProb = paramsMap.get("resetProb").toString.toDouble + val tolerance = paramsMap.get("tolerance").toString.toDouble + val sourcesPath = s"${paramsMap.get("sourcesPath").toString}/${source}.txt" + + params.setApiName(api) + params.setDatasetName(datasetName) + params.setInputPath(inputPath) + params.setIsRaw(isRaw) + params.setNumIter(numIter) + params.setNumPartitions(numPartitions) + params.setOutputPath(outputPath) + params.setResetProb(resetProb) + params.setSourcesPath(sourcesPath) + params.setTolerance(tolerance) + params.setSource(source) + params.setAlgorithmName("PPr") + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val inputRDD = Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, numPartitions) + .map(f => (f._1.toLong, 
f._2.toLong)) + val graph = Util.convertToGraphXGraph(inputRDD) + + val result = api match { + case "fixMS" => + val sourcesId = sc.textFile(sourcesPath).map(_.toLong).collect() + PageRank.runParallelPersonalizedPageRank(graph, numIter, resetProb, sourcesId) + case "fixSS" => + PageRank.runWithOptions(graph, numIter, resetProb, Option(source.toLong)) + case "conSS" => + PageRank.runUntilConvergenceWithOptions(graph, tolerance, resetProb, Option(source.toLong)) + case _ => + val empty = sc.emptyRDD[Edge[Double]] + Graph.fromEdges(empty, 0.0) + } + + Util.saveDataToHDFS(result.vertices.map(f => s"${f._1}\t${f._2}"), outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/PPR_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/StronglyConnectedComponentsRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/StronglyConnectedComponentsRunner.scala new file mode 100644 index 0000000..35e2be1 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/StronglyConnectedComponentsRunner.scala @@ -0,0 +1,96 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.StronglyConnectedComponents +import org.apache.spark.{SparkConf, SparkContext} + +class SCCParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition: Int = _ + @BeanProperty var split: JHashMap[String, String] = new JHashMap[String, String] + + @BeanProperty var datasetName: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object StronglyConnectedComponentsRunner { + private val SCC_RESULT_SPLIT = "," + private val SCC_NUM_ITER = Integer.MAX_VALUE + + def main(args: Array[String]): Unit = { + if (args.length < 6) { + println(args.mkString(",")) + println("Usage:StronglyConnectedComponents ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val api = args(3) + val isRaw = args(4) + val cpuName = args(5) + val partNum = args(6).toInt + val paramFilepath = "conf/graph/scc/scc.yml" + + val representer = new Representer + representer.addClassTag(classOf[SCCParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SCCParams]), representer, options) + val description = new TypeDescription(classOf[SCCParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream(paramFilepath)).asInstanceOf[SCCParams] + val split = params.getSplit.get(dataset) + val appName = s"SCC_${dataset}_${api}_${cpuName}" + try { + val sc = new 
SparkContext(new SparkConf().setAppName(appName)) + val startTime = System.currentTimeMillis() + + val input = Util.readUndirectDataFromHDFS(sc, inputPath, split, partNum) + .flatMap(x => Iterator((x._1.toLong, x._2.toLong))) + val graph = Graph.fromEdgeTuples(input, 0) + val result = StronglyConnectedComponents.run(graph, SCC_NUM_ITER) + Util.saveDataToHDFS(result.vertices, SCC_RESULT_SPLIT, outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setCostTime(costTime) + params.setDatasetName(dataset) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("SCC") + params.setTestcaseType(s"SCC_${dataset}") + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/SCC_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: strongly connected component costTime: ${costTime}s") + + sc.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/SubgraphMatchingRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/SubgraphMatchingRunner.scala new file mode 100644 index 0000000..b3d5858 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/SubgraphMatchingRunner.scala @@ -0,0 +1,112 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.SubgraphMatching +import org.apache.spark.{SparkConf, SparkContext} + +class SubgraphMatchingParams extends Serializable{ + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitDataGraph: util.HashMap[String, String] = new util.HashMap[String, String]() + @BeanProperty var splitQueryGraph: String = _ + @BeanProperty var taskNum: Int = _ + @BeanProperty var resultNum: Int = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var isIdentical: String = _ + @BeanProperty var matchResult: Long = _ +} + + +object SubgraphMatchingRunner { + + def main(args: Array[String]): Unit = { + + try { + val datasetName = args(0) + val queryGraphName = args(1) + val isRaw = args(2) + val isIdentical = args(3) + val outputPath = args(4) + val inputPath = args(5) + val partitionNum = args(6).toInt + val taskNum = args(7).toInt + val queryGraphPath = args(8) + val testcaseType = s"SGM_${datasetName}_${queryGraphName}_${isIdentical}" + + val representer = new Representer + representer.addClassTag(classOf[SubgraphMatchingParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SubgraphMatchingParams]), representer, options) + val description = new TypeDescription(classOf[SubgraphMatchingParams]) + yaml.addTypeDescription(description) + val params = yaml.load(Utils.getStream("conf/graph/sgm/sgm.yml")).asInstanceOf[SubgraphMatchingParams] + + 
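+ // per-dataset field separators and the result cap are taken from conf/graph/sgm/sgm.yml loaded above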
val splitDataGraph = params.getSplitDataGraph.get(datasetName) + val resultNum = params.getResultNum + val splitQueryGraph = params.getSplitQueryGraph + + params.setAlgorithmName("SGM") + params.setDatasetName(datasetName) + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setIsRaw(isRaw) + params.setNumPartitions(partitionNum) + params.setTestcaseType(testcaseType) + params.setIsIdentical(isIdentical) + params.setTaskNum(taskNum) + + val isIdenticalBool = isIdentical match { + case "Identical" => true + case "unIdentical" => false + } + + val conf = new SparkConf() + .setAppName(testcaseType) + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val inputRDD = Util.readUndirectDataFromHDFS(sc, inputPath, splitDataGraph, partitionNum) + .map(f => (f._1.toLong, f._2.toLong)) + val queryGraphRDD = sc.textFile(queryGraphPath) + val edgelist: Array[(Long, Long)] = queryGraphRDD.map(line => { + val strings = line.split(splitQueryGraph) + (strings(0).toLong, strings(1).toLong) + }).collect() + + val (numSubgraphs, subgraphs) = + SubgraphMatching.run(inputRDD, edgelist, taskNum, resultNum, isIdenticalBool) + + params.setMatchResult(numSubgraphs) + println("total matched results:\t%d".format(numSubgraphs)) + subgraphs.map(x => x.mkString("\t")).saveAsTextFile(outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/SGM_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/TrangleCountRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/TrangleCountRunner.scala new file mode 100644 index 0000000..e59ac30 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/TrangleCountRunner.scala @@ -0,0 +1,110 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util.{HashMap => JHashMap} + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.SparkConf +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.TriangleCount +import org.apache.spark.sql.SparkSession + +class TriangleCountParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var partition: Int = _ + @BeanProperty var split: JHashMap[String, String] = new JHashMap[String, String] + @BeanProperty var datasetName: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object TriangleCountRunner { + private val TC_PARAM_FILEPATH = "conf/graph/tc/tc.yml" + private val TC_RESULT_SPLIT = "," + + def main(args: Array[String]): Unit = { + if (args.length < 5) { + println(args.mkString(",")) + println("Usage:TriangleCountRunner ") + System.exit(-1) + } + val dataset = args(0) + val inputPath = args(1) + val outputPath = args(2) + val api = args(3) + val isRaw = args(4) + + val representer = new Representer + 
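+ // register TriangleCountParams with the YAML representer so the report is dumped as a plain map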
representer.addClassTag(classOf[TriangleCountParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[TriangleCountParams]), representer, options) + val description = new TypeDescription(classOf[TriangleCountParams]) + yaml.addTypeDescription(description) + + val params = yaml.load(Utils.getStream(TC_PARAM_FILEPATH)).asInstanceOf[TriangleCountParams] + val split = params.getSplit.get(dataset) + try { + var appName = s"TC_${api}_${dataset}" + if (isRaw.equals("yes")) { + appName = s"TC_RAW_${api}_${dataset}" + } + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder().config(conf).getOrCreate() + val startTime = System.currentTimeMillis() + + val inputRDD = Util.readUndirectDataFromHDFS(spark.sparkContext, inputPath, split, params.getPartition) + .flatMap { x => + if (x._1.toLong == x._2.toLong) { + Iterator.empty + } else if (x._1.toLong > x._2.toLong) { + Iterator((x._2.toLong, x._1.toLong)) + } else { + Iterator((x._1.toLong, x._2.toLong)) + } + + } + val graph = Graph.fromEdgeTuples(inputRDD, 0) + val result = api match { + case "preCanonical" => TriangleCount.runPreCanonicalized(graph).vertices.repartition(params.getPartition) + case _ => TriangleCount.run(graph).vertices.repartition(params.getPartition) + } + Util.saveDataToHDFS(result, TC_RESULT_SPLIT, outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setCostTime(costTime) + params.setDatasetName(dataset) + params.setApiName(api) + params.setIsRaw(isRaw) + params.setAlgorithmName("TriangleCount") + params.setTestcaseType(appName) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/TC_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: triangle count costTime: ${costTime}s") + + spark.stop() + } catch { + case e: Throwable => + println(s"Exec Failure:${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/TrillionPageRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/TrillionPageRankRunner.scala new file mode 100644 index 0000000..5f42ead --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/TrillionPageRankRunner.scala @@ -0,0 +1,181 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty +import scala.collection.mutable +import scala.reflect.ClassTag + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.TrillionPageRank +import org.apache.spark.graphx.{Graph, TripletFields, VertexId} +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel +import org.apache.spark.{HashPartitioner, SparkConf, SparkContext} + +class TrillionPageRankConfig extends Serializable { + @BeanProperty var tpr: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class TrillionPageRankParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var numIter: Int = _ + 
@BeanProperty var numPartitions: Int = _ + @BeanProperty var resetProb: Double = _ + @BeanProperty var isOnlySrc: Boolean = _ + + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ +} + +object TrillionPageRankRunner { + def main(args: Array[String]): Unit = { + try { + val datasetName = args(0) + val inputPath = args(1) + val outputPath = args(2) + val isRaw = args(3) + + val stream = Utils.getStream("conf/graph/tpr/tpr.yml") + + val representer = new Representer + representer.addClassTag(classOf[TrillionPageRankParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[TrillionPageRankConfig]), representer, options) + val description = new TypeDescription(classOf[TrillionPageRankParams]) + yaml.addTypeDescription(description) + val config: TrillionPageRankConfig = yaml.load(stream).asInstanceOf[TrillionPageRankConfig] + val paramsMap = + config.tpr.get(datasetName).get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).asInstanceOf[util.HashMap[String, Object]] + + val params = new TrillionPageRankParams() + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setIsRaw(isRaw) + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setSplitGraph(paramsMap.get("splitGraph").asInstanceOf[String]) + params.setNumIter(paramsMap.get("numIter").asInstanceOf[Int]) + params.setResetProb(paramsMap.get("resetProb").asInstanceOf[Double]) + params.setIsOnlySrc(paramsMap.get("isOnlySrc").asInstanceOf[Boolean]) + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + val sparkConf = new SparkConf().setMaster("yarn") + val sc = new SparkContext(sparkConf) + + // record start time + val startTime = System.currentTimeMillis() + val data = datasetName match { + case "twitter_2010" => + val spark = SparkSession.builder().config(sparkConf).getOrCreate() + implicit val graph = spark.read + .orc(inputPath) + .rdd + .map(row => (row.getAs[Long]("srcId"), + (row.getAs[Double]("pr"), + row.getAs[mutable.WrappedArray[Long]]("dstId").toArray[Long]))) + .partitionBy(new HashPartitioner(params.numPartitions)) + .persist(StorageLevel.MEMORY_ONLY_SER) + graph.foreachPartition(f => {}) + graph + case _ => sc.textFile (inputPath, params.numPartitions).map (f => { + val urls = f.split (params.splitGraph) + (urls (0).split ("_") (0).toLong, (urls (0).split ("_") (1).toDouble, urls.drop (1).map (_.toLong) ) ) + }) + } + + val attr = isRaw match { + case "no" => TrillionPageRank.run(data, params.numPartitions, params.numIter, params.resetProb, params.isOnlySrc) + case _ => openCompute(data, params.numPartitions, params.numIter, params.resetProb) + } + attr.map(i => i._1 + "\t" + i._2.formatted("%.6f")).saveAsTextFile(params.outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("TrillionPageRank costTime = " + costTime + "s") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/TPR_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + + private def openCompute( + edge: RDD[(Long, (Double, Array[Long]))], + part: Int, + numIter: Int, + resetProb: Double): RDD[(Long, Double)] = { + val flatEdges = edge.partitionBy(new 
HashPartitioner(part)).flatMap(f => f._2._2.map(x => (f._1, x))) + val graph = Graph.fromEdgeTuples(flatEdges, 0, edgeStorageLevel = StorageLevel.MEMORY_ONLY_SER, vertexStorageLevel = StorageLevel.MEMORY_ONLY_SER) + + runWithOptions(graph, numIter, resetProb).vertices + } + + private def runWithOptions[VD: ClassTag, ED: ClassTag]( + graph: Graph[VD, ED], + numIter: Int, + resetProb: Double = 0.15, + srcId: Option[VertexId] = None): Graph[Double, Double] = { + val personalized = srcId.isDefined + val src: VertexId = srcId.getOrElse(-1L) + + var rankGraph: Graph[Double, Double] = graph + .outerJoinVertices(graph.outDegrees) { (vid, vdata, deg) => deg.getOrElse(0) } + .mapTriplets(e => 1.0 / e.srcAttr, TripletFields.Src) + .mapVertices { (id, attr) => + if (!(id != src && personalized)) 1.0 else 0.0 + } + + def delta(u: VertexId, v: VertexId): Double = { + if (u == v) 1.0 else 0.0 + } + + var iteration = 0 + var prevRankGraph: Graph[Double, Double] = null + while (iteration < numIter) { + rankGraph.cache() + + val rankUpdates = rankGraph.aggregateMessages[Double] ( + ctx => ctx.sendToDst(ctx.srcAttr * ctx.attr), _ + _, TripletFields.Src) + prevRankGraph = rankGraph + val rPrb = if (personalized) { + (src: VertexId, id: VertexId) => resetProb * delta(src, id) + } else { + (src: VertexId, id: VertexId) => resetProb + } + + rankGraph = rankGraph.outerJoinVertices(rankUpdates) { + (id, oldRank, msgSumOpt) => rPrb(src, id) + (1.0 - resetProb) * msgSumOpt.getOrElse(0.0) + }.cache() + rankGraph.edges.foreachPartition(x => {}) + prevRankGraph.vertices.unpersist(false) + prevRankGraph.edges.unpersist(false) + + iteration += 1 + } + rankGraph + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/TrustRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/TrustRankRunner.scala new file mode 100644 index 0000000..585d500 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/TrustRankRunner.scala @@ -0,0 +1,130 @@ +package com.bigdata.graph + +import java.io.{FileWriter, InputStreamReader} +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.Graph +import org.apache.spark.graphx.lib.TrustRank +import org.apache.spark.{SparkConf, SparkContext} + + +class TrustRankConfig extends Serializable{ + @BeanProperty var trustRank: util.HashMap[String, Object] = _ +} + +class TrustRankParams extends Serializable{ + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var seedsPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var computePartitions: Int = _ + @BeanProperty var tol: Double = _ + @BeanProperty var numIter: Int = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var resetProb: Double = _ + @BeanProperty var seedsCnt: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + + +object TrustRankRunner { + + def main(args: Array[String]): Unit = { + try { + val datsetName = args(0) + val computePartitions = args(1).toInt + val isRaw = args(2) + val inputPath = args(3) + val api = args(4) + val seedsCnt = args(5) + val outputPath = args(6) + + val 
stream: InputStreamReader = Utils.getStream("conf/graph/tr/tr.yml") + + val representer = new Representer + representer.addClassTag(classOf[TrustRankParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[TrustRankConfig]), representer, options) + val description = new TypeDescription(classOf[TrustRankParams]) + yaml.addTypeDescription(description) + val config: TrustRankConfig = yaml.load(stream).asInstanceOf[TrustRankConfig] + val paramsMap: util.HashMap[String, Object] = config.trustRank + .get(s"${datsetName}_${seedsCnt}") + .asInstanceOf[util.HashMap[String, Object]] + + val params = new TrustRankParams + + val seedsPath: String = paramsMap.get("seedsPath").toString + val splitGraph: String = paramsMap.get("splitGraph").toString + val tol: Double = paramsMap.get("tolerance").toString.toDouble + val numIter = paramsMap.get("numIter").toString.toInt + val resetProb = paramsMap.get("resetProb").toString.toDouble + + params.setSeedsCnt(seedsCnt) + params.setSeedsPath(seedsPath) + params.setSplitGraph(splitGraph) + params.setOutputPath(outputPath) + params.setDatasetName(datsetName) + params.setComputePartitions(computePartitions) + params.setIsRaw(isRaw) + params.setInputPath(inputPath) + params.setApiName(api) + params.setTol(tol) + params.setNumIter(numIter) + params.setResetProb(resetProb) + params.setAlgorithmName("TrustRank") + params.setTestcaseType(s"TrustRank_${datsetName}_${api}_${seedsCnt}") + + val appName = s"TrustRank_${datsetName}_${api}_${seedsCnt}" + + val sparkConf = new SparkConf() + .setMaster("yarn") + .setAppName(appName) + val sc = new SparkContext(sparkConf) + implicit val context = sc + + val startTime = System.currentTimeMillis() + + val edges = Util.loadLines2EdgesT(inputPath, computePartitions, 1.0D, false, splitGraph) + .setName("OriginalEdges") + val g = Graph.fromEdges(edges, 1.0D) + val seeds = sc.textFile(seedsPath, 10).map(_.split("\t")(0).trim.toLong) + + api match { + case "run" => TrustRank.run(g, seeds, numIter, resetProb) + .vertices.map(f => s"${f._1}\t${f._2}") + .saveAsTextFile(params.outputPath) + case "runUntilConvergence" => TrustRank.runUntilConvergence(g, seeds, tol, resetProb) + .vertices.map(f => s"${f._1}\t${f._2}") + .saveAsTextFile(params.outputPath) + case _ => throw new Exception("illegal api") + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/TrustRank_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/Util.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/Util.scala new file mode 100644 index 0000000..d10c479 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/Util.scala @@ -0,0 +1,363 @@ +package com.bigdata.graph + +import java.util.regex.Pattern + +import scala.collection.Map +import scala.reflect.ClassTag + +import smile.math.MathEx.cos + +import org.apache.spark.SparkContext +import org.apache.spark.graphx.{Edge, Graph, VertexId} +import org.apache.spark.ml.linalg.{DenseVector, Vector} +import org.apache.spark.rdd.RDD + +object Util { + + def readDirectDataFromHDFS(sc: SparkContext, + filePath: String, + split: 
String, + partition: Int): RDD[(String, String, String)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 3) { + Iterator.empty + } else { + val node1 = x(0) + val node2 = x(1) + val weight = x(2) + Iterator((node1, node2, weight)) + } + } + }) + } + + def readUndirectDataFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(String, String)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val node1 = x(0) + val node2 = x(1) + Iterator((node1, node2)) + } + } + }) + } + + def readCommFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(Long, Long)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + var mLine = line + if (mLine.contains("(") || mLine.contains(")")) { + mLine = mLine.replaceAll("\\(", "") + mLine = mLine.replaceAll("\\)", "") + } + val x = mLine.split(split) + if (x.length < 2) { + Iterator.empty + } else { + Iterator((x(0).toLong, x(1).toLong)) + } + } + }) + } + + def readGraphFromHDFS(sc: SparkContext, + filePath: String, + split: String, + isWeighted: Boolean, + partition: Int): RDD[(Long, Long, Double)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + var w = 1.0 + if (isWeighted) { + w = x(2).toDouble + } + Iterator((x(0).toLong, x(1).toLong, w)) + } + } + }) + } + + def loadLines2EdgesT[ED]( + path: String, + partNum: Int, + defaultValue: ED, + weighted: Boolean, + split: String = ",")(implicit sc: SparkContext): RDD[Edge[ED]] = { + var lines = sc.textFile(path, partNum) + lines = + if (lines.getNumPartitions > partNum) lines.coalesce(partNum) else lines + lines.mapPartitions(iterator => { + val pattern = Pattern.compile(split) + iterator.flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val contents = pattern.split(line) + val srcId = contents(0).trim.toLong + val dstId = contents(1).trim.toLong + if (srcId == dstId) { + Iterator.empty + } else { + if (weighted && contents.length >= 3) { + Iterator(Edge(srcId, dstId, contents(2).asInstanceOf[ED])) + } else { + Iterator(Edge(srcId, dstId, defaultValue)) + } + } + } + }) + }) + } + + def readEdgeListFromHDFS( + sc: SparkContext, + filePath: String, + split: String, + isWeighted: Boolean, + partition: Int): RDD[(VertexId, VertexId, Double)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.trim.split(split) + if (x.length < 2) { + Iterator.empty + } else { + var w = 1.0 + if (isWeighted && x.length > 2) { + w = x(2).toDouble + } + Iterator.single((x(0).toLong, x(1).toLong, w)) + } + } + }) + } + + def readEdgeFileFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partNum: Int): RDD[(Long, Double)] = { + sc.textFile(filePath).repartition(partNum) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.trim.split(split) + if (x.length < 2) { + Iterator.empty + } else { + Iterator.single((x(0).toLong, x(1).toDouble)) + } + } + }) + } + + def readTopKResultFromHDFS(sc: SparkContext, + filePath: String, + split: String, + 
partition: Int): RDD[VertexId] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.trim.split(split) + if (x.length < 2) { + Iterator.empty + } else { + Iterator.single(x.head.toLong) + } + } + }) + } + + def readDataFromHDFSForDegree(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(Long, Long)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#") || line.startsWith("%")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val node1 = x(0) + val node2 = x(1) + Iterator((node1.toLong, node2.toLong)) + } + } + }) + } + def readDirectWeightDataFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(String, String, Double)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#") || line.startsWith("%")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 3) { + Iterator.empty + } else { + var weight = 1.0 + if(x.length == 3){ + weight = x(2).toDouble + } + val node1 = x(0) + val node2 = x(1) + Iterator((node1, node2, weight)) + } + } + }) + } + + def saveNode2VecModel(modelRDD: RDD[(Long, Vector)], output: String): Unit = { + + modelRDD + .map{ case (u, vec) => s"$u ${vec.toArray.mkString("(", ",", ")")}"} + .saveAsTextFile(output) + } + + def saveDataToHDFS[T: ClassTag](data: RDD[T], filePath: String): Unit = { + data.saveAsTextFile(filePath) + } + + def saveDataToHDFS[T: ClassTag, V: ClassTag](data: RDD[(T, V)], + split: String, + filePath: String): Unit = { + data.map(f => f._1 + split + f._2).saveAsTextFile(filePath) + } + + def loadSourceVertices(sc: SparkContext, filePath: String): RDD[Long] = { + sc.textFile(filePath).flatMap { line => + if (line.startsWith("#")) { + Iterator.empty + } else { + Iterator(line.trim.toLong) + } + } + } + + def convertToGraphXGraph(edgeRdd: RDD[(Long, Long)]): Graph[Int, Int] = { + val edges = edgeRdd.map(x => Edge(x._1, x._2, 1)) + val defaultUser = (1) + val graph = Graph.fromEdges(edges, defaultUser) + graph + } + + def buildUnweightedGraph(inputRdd: RDD[(String, String)], + isDirected: Boolean, + defaultVertex: Int = 0, + defaultEdge: Int = 0): Graph[Int, Int] = { + val edges: RDD[Edge[Int]] = inputRdd.flatMap(f => { + val src: VertexId = f._1.toLong + val dst: VertexId = f._2.toLong + + if (isDirected) { + Iterator(Edge(src, dst, defaultEdge)) + } else { + if (src < dst) { + Iterator(Edge(src, dst, defaultEdge)) + } else { + Iterator(Edge(dst, src, defaultEdge)) + } + } + }) + + val graph: Graph[Int, Int] = Graph.fromEdges(edges, defaultVertex) + graph + } + + def readDataUnwieightedFromHDFS(sc: SparkContext, + filePath: String, + split: String, + partition: Int): RDD[(VertexId, VertexId)] = { + sc.textFile(filePath, partition) + .flatMap(line => { + if (line.startsWith("#") || line.startsWith("%")) { + Iterator.empty + } else { + val x = line.split(split) + if (x.length < 2) { + Iterator.empty + } else { + val node1 = x(0) + val node2 = x(1) + Iterator((node1.toLong, node2.toLong)) + } + } + }) + } + + def readEdgeList(sc: SparkContext, filePath: String, split: String, partition: Int): RDD[(Long, Long)] = { + sc.textFile(filePath, partition).flatMap(line => { + if (line.startsWith("#")) { + Iterator.empty + } else { + val x = line.trim.split(split) + if (x.length > 1) { + val src = x(0).toLong + val dst = x(1).toLong + if (src != dst) { + Iterator.single((src, dst)) + 
} else { + Iterator.empty + } + } else { + Iterator.empty + } + } + }) + } + + def get(modelRDD: RDD[(Long, Vector)]): Map[Long, Vector] = { + modelRDD.collectAsMap() + } + + def distCos(x: Array[Double], y: Array[Double]): Double = cos(x, y) + + def readNode2VecModel(sc: SparkContext, input: String): RDD[(Long, Vector)] = { + val rdd: RDD[(Long, Vector)] = sc.textFile(input).mapPartitions(it => { + val regexp = "([0-9]+) \\((.*)\\)".r + it.map { + case regexp(u, emb) => (u.toLong, new DenseVector(emb.split(",") + .map(_.toDouble)): Vector) + } + }).cache() + rdd + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/WCEHiveRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/WCEHiveRunner.scala new file mode 100644 index 0000000..86123a9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/WCEHiveRunner.scala @@ -0,0 +1,78 @@ +package com.bigdata.graph + +import org.apache.spark.graphx.lib.WeakCliqueEnumeration +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf + +object WCEHiveRunner { + + def main(args: Array[String]): Unit = { + if (args.length < 7) { + println("Usage:WCERunner
<tableName> <col1> <col2> <maxIter> <maxDegree> <saveMode> <saveArg>
") + System.exit(-1) + } + + val tableName: String = args(0) + val col1: String = args(1) + val col2: String = args(2) + val maxIter: Int = args(3).toInt + val maxDegree: Int = args(4).toInt + val saveMode: String = args(5) + val saveArg: String = args(6) + + try { + val sparkConf: SparkConf = new SparkConf() + .setAppName("WCE") + .setMaster("yarn") + .set("spark.hadoop.mapreduce.input.fileinputformat.input.dir.recursive", "true") + val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(sparkConf).getOrCreate() + + // record start time + val startTime: Long = System.currentTimeMillis() + val sql = s"select * from ${tableName}" + val edges: DataFrame = spark.sql(sql).select(col1, col2) + val edgesRDD: RDD[(String, String)] = edges.rdd.map(row => (row(0).toString, row(1).toString)) + + val result: (RDD[(Int, String)], RDD[(Int, Int)]) = WeakCliqueEnumeration.run(edgesRDD, maxIter, maxDegree) + val _ = saveMode match { + case "hive" => { + val nodeMap: RDD[Row] = result._1.asInstanceOf[RDD[(Int, String)]].map(t => Row(t._2, t._1)) + val cliqueMap: RDD[Row] = result._2.asInstanceOf[RDD[(Int, Int)]].map(t => Row(t._1, t._2)) + val schema_nodeMap: StructType = StructType(List(StructField("_id", StringType, true), StructField("_id_int", IntegerType, true))) + val schema_cliqueMap: StructType = StructType(List(StructField("_id_int", IntegerType, true), StructField("_clique_id", IntegerType, true))) + + val nodeMapDF: DataFrame = spark.createDataFrame(nodeMap, schema_nodeMap) + val cliqueMapDF: DataFrame = spark.createDataFrame(cliqueMap, schema_cliqueMap) + nodeMapDF.createOrReplaceTempView("nodeMapTmpV") + cliqueMapDF.createOrReplaceTempView("cliqueMapTmpV") + + val outputTableName: String = saveArg + val createSql = s"create table ${outputTableName}_wce_nodeMap(_id varchar(250), _id_int int)" + val createSql2 = s"create table ${outputTableName}_wce_cliqueMap(_id_int int,_clique_id int)" + spark.sql(createSql) + spark.sql(createSql2) + + spark.sql(s"insert into ${outputTableName}_wce_nodeMap select * from nodeMapTmpV") + spark.sql(s"insert into ${outputTableName}_wce_cliqueMap select * from cliqueMapTmpV") + } + case "hdfs" => { + val outputPath: String = saveArg + Util.saveDataToHDFS(result._1, ",", outputPath + "/map_info") + Util.saveDataToHDFS(result._2, ",", outputPath + "/clique_info") + } + case _ => throw new Exception("illegal save mode") + } + + val costTime: Double = (System.currentTimeMillis() - startTime) / 1000.0 + println("weak clique detection result = true") + println(s"Exec Successful: costTime: ${costTime}s") + spark.close() + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/WCERunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/WCERunner.scala new file mode 100644 index 0000000..f784958 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/WCERunner.scala @@ -0,0 +1,105 @@ +package com.bigdata.graph + +import java.io.FileWriter +import java.util + +import scala.beans.BeanProperty + +import com.bigdata.utils.Utils +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} + +import org.apache.spark.graphx.lib.WeakCliqueEnumeration +import org.apache.spark.{SparkConf, SparkContext} + +class WceConfig extends Serializable { + @BeanProperty var wce: util.HashMap[String, Object] = _ 
+} + +class WceParams extends Serializable { + @BeanProperty var inputPath: String = _ + @BeanProperty var outputPath: String = _ + @BeanProperty var splitGraph: String = _ + @BeanProperty var maxIterations: Int = _ + @BeanProperty var maxDegree: Int = _ + @BeanProperty var numPartitions: Int = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object WCERunner { + + def main(args: Array[String]): Unit = { + try { + val sparkConf = new SparkConf().setAppName("WCE").setMaster("yarn") + val sc = new SparkContext(sparkConf) + + val datasetName = args(0) + val inputPath = args(1) + val outputPath = args(2) + + // record start time + val startTime = System.currentTimeMillis() + + val stream = Utils.getStream("conf/graph/wce/wce.yml") + + val representer = new Representer + representer.addClassTag(classOf[WceParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[WceConfig]), representer, options) + val description = new TypeDescription(classOf[WceParams]) + yaml.addTypeDescription(description) + val config: WceConfig = yaml.load(stream).asInstanceOf[WceConfig] + val paramsMap = config.wce.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + + val params = new WceParams() + + val splitGraph = paramsMap.get("splitGraph").toString + val maxIterations = paramsMap.get("maxIterations").toString.toInt + val maxDegree = paramsMap.get("maxDegree").toString.toInt + val numPartitions = paramsMap.get("numPartitions").toString.toInt + + params.setInputPath(inputPath) + params.setOutputPath(outputPath) + params.setSplitGraph(splitGraph) + params.setMaxIterations(maxIterations) + params.setMaxDegree(maxDegree) + params.setNumPartitions(numPartitions) + params.setDatasetName(datasetName) + params.setAlgorithmName("Wce") + params.setTestcaseType(s"Wce_${datasetName}") + + println("inputPath: " + inputPath) + println("outputPath: " + outputPath) + + val inputRdd = Util.readUndirectDataFromHDFS(sc, inputPath, splitGraph, numPartitions) + val result = WeakCliqueEnumeration.run(inputRdd, maxIterations.toInt, maxDegree.toInt) + Util.saveDataToHDFS(result._1, ",", outputPath + "/map_info") + Util.saveDataToHDFS(result._2, ",", outputPath + "/clique_info") + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("weak clique detection result = true, and costTime = " + costTime + "s") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/WCE_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala new file mode 100644 index 0000000..191d6dd --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedLablePropagationRunner.scala @@ -0,0 +1,102 @@ +// scalastyle:off + +package com.bigdata.graph +import com.bigdata.utils.Utils + +import org.apache.spark.graphx.lib._ +import org.apache.spark.graphx.{Edge, Graph, GraphLoader, PartitionStrategy} +import 
org.apache.spark.rdd.RDD
+import org.apache.spark.sql.SparkSession
+import org.apache.spark.{SparkConf, SparkContext}
+import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml}
+import org.yaml.snakeyaml.constructor.Constructor
+import org.yaml.snakeyaml.nodes.Tag
+import org.yaml.snakeyaml.representer.Representer
+
+import java.io.{File, FileWriter, InputStreamReader}
+import java.util
+import scala.beans.BeanProperty
+
+class WLPAConfig extends Serializable {
+  @BeanProperty var wlpa: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _
+}
+
+class WLPAParams extends Serializable {
+  @BeanProperty var splitGraph: String = _
+  @BeanProperty var commputePartition: Int = _
+  @BeanProperty var maxIter: Int = _
+
+  @BeanProperty var outputPath: String = _
+  @BeanProperty var dataPath: String = _
+  @BeanProperty var costTime: Double = _
+  @BeanProperty var datasetName: String = _
+  @BeanProperty var isRaw: String = _
+  @BeanProperty var algorithmName: String = _
+  @BeanProperty var testcaseType: String = _
+}
+
+
+object WeightedLablePropagationRunner {
+  def main(args: Array[String]): Unit = {
+    try {
+      val modelConfSplit = args(0).split("-")
+      val (datasetName, platformName, isRaw) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2))
+      val dataPath = args(1)
+      val outputPath = args(2)
+      val representer = new Representer
+      representer.addClassTag(classOf[WLPAParams], Tag.MAP)
+      val options = new DumperOptions
+      options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK)
+      val stream: InputStreamReader = Utils.getStream("conf/graph/wlpa/wlpa.yml")
+      val yaml = new Yaml(new Constructor(classOf[WLPAConfig]), representer, options)
+      val description = new TypeDescription(classOf[WLPAParams])
+      yaml.addTypeDescription(description)
+      val config: WLPAConfig = yaml.load(stream).asInstanceOf[WLPAConfig]
+
+      val params = new WLPAParams()
+      val paramsMap =
+        config.wlpa.get(isRaw match {
+          case "no" => "opt"
+          case _ => "raw"
+        }).get(datasetName).asInstanceOf[util.HashMap[String, Object]]
+      params.setSplitGraph(paramsMap.get("splitGraph").toString)
+      params.setCommputePartition(paramsMap.get("commputePartition").toString.toInt)
+      params.setMaxIter(paramsMap.get("maxIter").toString.toInt)
+      params.setDatasetName(datasetName)
+      params.setDataPath(dataPath)
+      params.setOutputPath(outputPath)
+      params.setIsRaw(isRaw)
+      params.setAlgorithmName("WLPA")
+      params.setTestcaseType(s"${params.algorithmName}_${datasetName}_${isRaw}")
+
+      val conf = new SparkConf().setAppName(params.testcaseType)
+      val spark = SparkSession.builder.config(conf).getOrCreate()
+      val sc = spark.sparkContext
+
+      val startTime = System.currentTimeMillis()
+      val inputRdd = Util.readDirectWeightDataFromHDFS(sc, params.dataPath, params.splitGraph, params.commputePartition)
+        .map(f => Edge(f._1.toLong, f._2.toLong, f._3.toDouble))
+      val graph = Graph.fromEdges(inputRdd, 1.0)
+      val result = WLabelPropagation.run(graph, params.maxIter).vertices
+      Util.saveDataToHDFS(result, ",", outputPath)
+      val finishTime = System.currentTimeMillis()
+      val costTime = (finishTime - startTime) / 1000.0
+
+      params.setCostTime(costTime)
+      println(s"Exec Successful: costTime: ${costTime}s")
+
+      val folder = new File("report")
+      if (!folder.exists()) {
+        val mkdir = folder.mkdirs()
+        println(s"Create dir report ${mkdir}")
+      }
+      val writer = new FileWriter(
+        s"report/${params.testcaseType}_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml")
+      yaml.dump(params, writer)
+    } catch {
+      case e: Throwable =>
println(s"Exec Failure: ${e.getMessage}")
+        throw e
+    }
+  }
+}
diff --git a/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedPageRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedPageRankRunner.scala
new file mode 100644
index 0000000..928c11e
--- /dev/null
+++ b/tools/kal-test/src/main/scala/com/bigdata/graph/WeightedPageRankRunner.scala
@@ -0,0 +1,129 @@
+package com.bigdata.graph
+
+import java.io.FileWriter
+import java.util
+
+import scala.beans.BeanProperty
+
+import com.bigdata.utils.Utils
+import org.yaml.snakeyaml.constructor.Constructor
+import org.yaml.snakeyaml.nodes.Tag
+import org.yaml.snakeyaml.representer.Representer
+import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml}
+
+import org.apache.spark.graphx.lib.WeightedPageRank
+import org.apache.spark.graphx.{Edge, Graph}
+import org.apache.spark.{SparkConf, SparkContext}
+
+class WprConfig extends Serializable {
+  @BeanProperty var wpr: util.HashMap[String, util.HashMap[String, Object]] = _
+}
+
+class WprParams extends Serializable {
+  @BeanProperty var inputPath: String = _
+  @BeanProperty var outputPath: String = _
+  @BeanProperty var splitGraph: String = _
+  @BeanProperty var numIter: Int = _
+  @BeanProperty var costTime: Double = _
+  @BeanProperty var datasetName: String = _
+  @BeanProperty var partitionNum: Int = _
+  @BeanProperty var apiName: String = _
+  @BeanProperty var isRaw: String = _
+  @BeanProperty var algorithmName: String = _
+  @BeanProperty var testcaseType: String = _
+  @BeanProperty var algoTolerance: String = _
+  @BeanProperty var computeType: String = _
+}
+
+object WeightedPageRankRunner {
+  def main(args: Array[String]): Unit = {
+    try {
+      val datasetName = args(0)
+      val inputPath = args(1)
+      val outputPath = args(2)
+      val api = args(3)
+      val isRaw = args(4)
+      val splitGraph = args(5)
+
+      val stream = Utils.getStream("conf/graph/wpr/wpr.yml")
+
+      val representer = new Representer
+      representer.addClassTag(classOf[WprParams], Tag.MAP)
+      val options = new DumperOptions
+      options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK)
+      val yaml = new Yaml(new Constructor(classOf[WprConfig]), representer, options)
+      val description = new TypeDescription(classOf[WprParams])
+      yaml.addTypeDescription(description)
+      val config: WprConfig = yaml.load(stream).asInstanceOf[WprConfig]
+      // get the corresponding configuration entries by algorithm (API) name
+      val paramsMap =
+        config.wpr.get(api).get(datasetName).asInstanceOf[util.HashMap[String, Object]]
+
+      val params = new WprParams()
+
+      val partitionNum = paramsMap.get("partitionNum").toString.toInt
+      val numIter = paramsMap.get("numIter").toString.toInt
+      val algoTolerance = paramsMap.get("tolerance").toString.toDouble
+
+      params.setInputPath(inputPath)
+      params.setOutputPath(outputPath)
+      params.setSplitGraph(splitGraph)
+      params.setNumIter(numIter)
+      params.setDatasetName(datasetName)
+      params.setPartitionNum(partitionNum)
+      params.setApiName(api)
+      params.setIsRaw(isRaw)
+      params.setAlgorithmName("wpr")
+
+      var appName = s"WeightedPageRank_${datasetName}_${api}"
+      if (isRaw == "yes") {
+        appName = s"WeightedPageRank_${datasetName}_${api}_raw"
+      }
+      params.setTestcaseType(appName)
+
+      val sparkConf = new SparkConf()
+        .setAppName(appName)
+        .setMaster("yarn")
+      val sc = new SparkContext(sparkConf)
+
+      // record start time
+      val startTime = System.currentTimeMillis()
+
+      implicit val context = sc
+      val edgeRdd = sc.textFile(inputPath, partitionNum).filter(!_.startsWith("%")).map(f => {
+        val arr = f.split(splitGraph)
+        if (arr.length == 3) {
+          Edge(arr(0).toLong,
arr(1).toLong, arr(2).toDouble) + } else { + Edge(arr(0).toLong, arr(1).toLong, 1.0D) + } + }) + val g = Graph.fromEdges(edgeRdd, 1.0) + val result = if (api.toLowerCase().equals("static")) { + WeightedPageRank.run(g, numIter, 0.15) + } else { + WeightedPageRank.runUntilConvergence(g, algoTolerance, 0.15) + } + val sumresult = result.vertices.values.sum() + result.vertices.map(f => s"${f._1},${f._2}").saveAsTextFile(outputPath) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"WeightedPageRank Computing Finished. sum of WPR result: ${sumresult}") + println(s"WeightedPageRank Computing Finished. total Time Spend: ${costTime} s") + + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter( + s"report/WPR_${Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", System.currentTimeMillis())}.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/ALSRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/ALSRunner.scala new file mode 100644 index 0000000..08dca8f --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/ALSRunner.scala @@ -0,0 +1,301 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.apache.spark.ml.recommendation.ALS +import org.apache.spark.mllib.recommendation.Rating +import org.apache.spark.mllib.linalg.SparseVector +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty +import scala.collection.mutable + +class ALSConfig extends Serializable { + @BeanProperty var als: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class ALSParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var nonnegative: Boolean = _ + @BeanProperty var implicitPrefs: Boolean = _ + @BeanProperty var numItemBlocks: Int = _ + @BeanProperty var numUserBlocks: Int = _ + @BeanProperty var numIterations: Int = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var alpha: Double = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var dataStructure: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object ALSRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), 
modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val trainingDataPath = dataPathSplit(0) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/als/als.yml") + val representer = new Representer + representer.addClassTag(classOf[ALSParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[ALSConfig]), representer, options) + val description = new TypeDescription(classOf[ALSParams]) + yaml.addTypeDescription(description) + val configs: ALSConfig = yaml.load(stream).asInstanceOf[ALSConfig] + val paramsMap: util.HashMap[String, Object] = configs.als.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(dataStructure).get(datasetName) + val params = new ALSParams() + params.setPt(paramsMap.getOrDefault("pt", "1000").asInstanceOf[Int]) + params.setNumIterations(paramsMap.getOrDefault("numIterations", "200").asInstanceOf[Int]) + params.setNonnegative(paramsMap.getOrDefault("nonnegative", "false").asInstanceOf[Boolean]) + params.setImplicitPrefs(paramsMap.getOrDefault("implicitPrefs", "false").asInstanceOf[Boolean]) + params.setNumItemBlocks(paramsMap.getOrDefault("numItemBlocks", "228").asInstanceOf[Int]) + params.setNumUserBlocks(paramsMap.getOrDefault("numUserBlocks", "228").asInstanceOf[Int]) + params.setRegParam(paramsMap.getOrDefault("regParam", "0.0").asInstanceOf[Double]) + params.setAlpha(paramsMap.getOrDefault("alpha", "1.0").asInstanceOf[Double]) + params.setTrainingDataPath(trainingDataPath) + params.setDataStructure(dataStructure) + params.setDatasetName(datasetName) + params.setApiName(apiName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("ALS") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${dataStructure}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + params.setStartTime(startTime) + val sc = spark.sparkContext + + import spark.implicits._ + val rawdata: RDD[SparseVector] = sc.objectFile(dataPath).repartition(params.pt) + val (predictions, costTime) = dataStructure match { + case "dataframe" => + val ratings = Vector2Rating(rawdata).toDF().cache() + println("count: " + ratings.count()) + val mapTime = System.currentTimeMillis() + println("map cost Time[seconds]: " + (mapTime - startTime).toDouble / 1000.0) + new 
ALSKernel().runDataframeJob(spark, ratings, params) + case "rdd" => + val ratings: RDD[Rating] = Vector2Rating(rawdata).cache() + println("count: " + ratings.count()) + val mapTime = System.currentTimeMillis() + println("map cost Time[seconds]: " + (mapTime - startTime).toDouble / 1000.0) + new ALSKernel().runRDDJob(spark, ratings, params) + } + params.setEvaluation(predictions) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${predictions};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + + + def Vector2Rating(rawdata: RDD[SparseVector]) : RDD[Rating] = { + val Ratingdata: RDD[Rating] = rawdata.zipWithIndex().flatMap{ + case (v, i) => + val arr = mutable.ArrayBuilder.make[Rating] + arr.sizeHint(v.numActives) + v.foreachActive{(ii, vi) => + arr += Rating(i.toInt, ii, vi.toFloat) + } + arr.result() + } + Ratingdata + } + } +} + +class ALSKernel { + + def runDataframeJob(spark: SparkSession, ratings: DataFrame, params: ALSParams): (Double, Double) = { + val sc = spark.sparkContext + val numIterations = params.numIterations + val nonnegative = params.nonnegative + val implicitPrefs = params.implicitPrefs + val numItemBlocks = params.numItemBlocks + val numUserBlocks = params.numUserBlocks + val regParam = params.regParam + val alpha = params.alpha + + val als = new ALS() + .setMaxIter(numIterations) + .setUserCol("user") + .setItemCol("product") + .setRatingCol("rating") + .setNonnegative(nonnegative) + .setImplicitPrefs(implicitPrefs) + .setNumItemBlocks(numItemBlocks) + .setNumUserBlocks(numUserBlocks) + .setRegParam(regParam) + .setAlpha(alpha) + + val paramMap = ParamMap(als.maxIter -> params.numIterations) + .put(als.regParam, params.regParam) + + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(als.maxIter -> params.numIterations) + .put(als.regParam, params.regParam) + } + val maxIterParamPair = ParamPair(als.maxIter, params.numIterations) + val regParamPair = ParamPair(als.regParam, params.regParam) + val model = params.apiName match { + case "fit" => als.fit(ratings) + case "fit1" => als.fit(ratings, paramMap) + case "fit2" => + val models = als.fit(ratings, paramMaps) + models(0) + case "fit3" => als.fit(ratings, maxIterParamPair, regParamPair) + } + + val costTime = (System.currentTimeMillis() - params.startTime) / 1000.0 + + model.setColdStartStrategy("drop") + val predictions = model.transform(ratings) + + val res = if (params.implicitPrefs) { + val p = predictions.select("rating", "prediction").rdd + .map{ case Row(label: Double, prediction: Float) => (label, prediction) } + .map{ case (r1, r2) => + val pr = if (r1 > 0.0) 1.0f else 0.0f + val err = (pr - r2) + err * err + }.mean() + println("implicitPrefs Mean Squared Error = " + p) + p + } else { + val p = predictions.select("rating", "prediction").rdd + .map{ case Row(label: Double, prediction: Float) => (label, 
prediction) } + .map {t => + val err = (t._1 - t._2) + err * err + }.mean() + println("Mean Squared Error = " + p) + p + } + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def runRDDJob(spark: SparkSession, ratings: RDD[Rating], params: ALSParams): (Double, Double) = { + + val sc = spark.sparkContext + val numIterations = params.numIterations + val nonnegative = params.nonnegative + val implicitPrefs = params.implicitPrefs + val numItemBlocks = params.numItemBlocks + val numUserBlocks = params.numUserBlocks + val regParam = params.regParam + val alpha = params.alpha + + import org.apache.spark.mllib.recommendation.ALS + val rank = 10 + val model = ALS.train(ratings, rank, numIterations, regParam, numUserBlocks) + val costTime = (System.currentTimeMillis() - params.startTime) / 1000.0 + + val usersProducts = ratings.map { case Rating(user, product, rate) => + (user, product) + } + val predictions = + model.predict(usersProducts).map{ case Rating(user, product, rate) => + ((user, product), rate) + } + val ratesAndPreds = ratings.map { case Rating(user, product, rate) => + ((user, product), rate) + }.join(predictions) + + val res = if (implicitPrefs) { + val p = ratesAndPreds.map{ case ((user, product), (r1, r2)) => (r1, r2)} + .map{ case (r1, r2) => + val pr = if (r1 > 0.0) 1.0f else 0.0f + val err = (pr - r2) + err * err + }.mean() + println("implicitPrefs Mean Squared Error = " + p) + p + } else { + val p = ratesAndPreds.map{ case ((user, product), (r1, r2)) => (r1, r2)} + .map {t => + val err = (t._1 - t._2) + err * err + }.mean() + println("Mean Squared Error = " + p) + p + } + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala new file mode 100644 index 0000000..dcfd7a4 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/BORunner.scala @@ -0,0 +1,369 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.{GBTClassifier, RandomForestClassifier} +import org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.ml.feature.{OneHotEncoder, SQLTransformer, StringIndexer, VectorAssembler} +import org.apache.spark.ml.regression.RandomForestRegressor +import org.apache.spark.ml.tuning._ +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{coalesce, col, lit, mean} +import org.apache.spark.sql.types.DoubleType +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter, PrintWriter} +import java.nio.file.{Paths, Files} +import java.util +import scala.beans.BeanProperty +import scala.io.Source + +class BOConfig extends Serializable { + @BeanProperty var bo: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class BOParams extends Serializable { + @BeanProperty var partitionNum: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var 
isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object BORunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/bo/bo.yml") + val representer = new Representer + representer.addClassTag(classOf[BOParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[BOConfig]), representer, options) + val description = new TypeDescription(classOf[BOParams]) + yaml.addTypeDescription(description) + val configs: BOConfig = yaml.load(stream).asInstanceOf[BOConfig] + val paramsMap: util.HashMap[String, Object] = configs.bo.get("opt").get(datasetName) + val params = new BOParams() + params.setPartitionNum(paramsMap.getOrDefault("partitionNum", "1000").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("BO") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + params.setStartTime(startTime) + val sc = spark.sparkContext + + var searchArray:Array[Int] = Array() + var metricArray:Array[Double] = Array() + var timeArray:Array[Double] = Array() + for (a <- 1 to 10) { + val res = if (datasetName == "BostonHousing") { + new BOKernel().bostonHousingRfRegressor(spark, params, a.toLong) + } else if (datasetName == "TitanicRf") { + new BOKernel().titanicRf(spark, params, a.toLong) + } else if (datasetName == "TitanicGBT") { + new BOKernel().titanicGBT(spark, params, a.toLong) + } else { + (0, 0.0, 0.0) + } + searchArray +:= res._1 + metricArray +:= res._2 + timeArray +:= res._3 + } + + val res = metricArray.sum * 1.0 / metricArray.length + val costTime = timeArray.sum * 1.0 / metricArray.length + println(searchArray.mkString(" ")) + println(searchArray.sum * 1.0 / metricArray.length) + println(metricArray.mkString(" ")) + println(res) + println(timeArray.mkString(" ")) + println(costTime) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if (ifCheck.equals("yes")) { + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: 
${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class BOKernel { + def bostonHousingRfRegressor(spark: SparkSession, params: BOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + val housingData = spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .option("header", true).option("inferSchema", "true").csv(trainPath).repartition(partitionNum) + val features = housingData.drop("MEDV") + + val Array(trainingData, testData) = housingData.withColumnRenamed("MEDV", "label") + .randomSplit(Array(0.8, 0.2), seed = 42) + trainingData.persist() + testData.persist() + val regressor = new RandomForestRegressor() + + val paramSpace:ParamSpace = new ParamSpace() + paramSpace.addIntParam(regressor.toString(), "numTrees", IntervalRange(3,30,1)) + paramSpace.addIntParam(regressor.toString(), "maxDepth", IntervalRange(2,20,1)) + paramSpace.addDoubleParam(regressor.toString(), "subsamplingRate", ContinueRange(0.5,1.0)) + paramSpace.addDoubleParam(regressor.toString(), "minInfoGain", ContinueRange(0,1.0)) + paramSpace.addIntParam(regressor.toString(), "minInstancesPerNode", DiscreteRange(Seq(1,2,3,5,10,15,20))) + + val featureColumnsNames = features.columns.toArray + val assembler = new VectorAssembler() + .setInputCols(featureColumnsNames) + .setOutputCol("features") + + val pipeline = new Pipeline().setStages(Array(assembler, regressor)) + + val cv = new BayesianCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new RegressionEvaluator().setMetricName("rmse")) + .setEstimatorParamSpace(paramSpace) + .setNumIterations(500) + .setNumFolds(5) + .setParallelism(10) + .setThreshold(3.45) + .setSeed(seed) + + val model = cv.fit(trainingData) + println(cv.getSearchNumber) + println(cv.getBestMetric) + trainingData.unpersist() + testData.unpersist() + val endTime = System.currentTimeMillis() + println((endTime - startTime)/1000.0) + (cv.getSearchNumber, cv.getBestMetric, (endTime - startTime)/1000.0 ) + } + + def titanicRf(spark: SparkSession, params: BOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + var dataWithNulls = { + spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ").option("header", true).csv(trainPath) + .repartition(1) + .withColumn("survived", col("Survived").cast(DoubleType)) + .withColumn("age", col("Age").cast(DoubleType)) + .withColumn("siblings_spouses", col("SibSp").cast(DoubleType)) + .withColumn("parents_children", col("Parch").cast(DoubleType)) + .withColumn("fare", col("Fare").cast(DoubleType)) + .select(col("survived"), col("Name") as "passenger_name", col("Pclass") as "passenger_class", col("Sex") as "sex", + col("age"), col("fare"), col("siblings_spouses"), col("parents_children")) + .repartition(partitionNum) + } + val meanAge = dataWithNulls.select(mean("age")).first.getDouble(0) + val data = dataWithNulls.withColumn("age", coalesce(col("age"), lit(meanAge))).cache() + val titleTransformer = new SQLTransformer("title").setStatement( + s""" + |SELECT * + |, CASE WHEN passenger_name LIKE '%\\.%' THEN split(passenger_name, '\\\\.')[0] + | ELSE 'Nothing' + | END AS passenger_title + |FROM __THIS__ + """.stripMargin + ) + val categoricalCols = 
Array("passenger_class", "sex", "passenger_title") + val indexCols = categoricalCols.map(_ + "_index") + val oheCols = categoricalCols.map(_ + "_ohe") + val stringIndexers = categoricalCols.map(cc => { + new StringIndexer(s"string_indexer_$cc") + .setHandleInvalid("keep") + .setInputCol(cc) + .setOutputCol(cc + "_index") + }) + val oneHotEncoder = { + new OneHotEncoder("ohe") + .setHandleInvalid("keep") + .setDropLast(false) + .setInputCols(indexCols) + .setOutputCols(oheCols) + } + + val numericalCols = Array("age", "fare", "siblings_spouses", "parents_children") + val vectorAssembler = { + new VectorAssembler("vector_assembler") + .setInputCols(oheCols ++ numericalCols) + .setOutputCol("features") + } + + val rawClassifier = new RandomForestClassifier("rf") + .setFeaturesCol("features") + .setLabelCol("survived") + .setProbabilityCol("survival_prob") + .setRawPredictionCol("survival_raw_pred") + + val pipeline = new Pipeline("pipeline") + .setStages(Array(titleTransformer) ++ stringIndexers ++ Array(oneHotEncoder, vectorAssembler, rawClassifier)) + + val paramSpace:ParamSpace = new ParamSpace() + paramSpace.addIntParam(rawClassifier.toString(), "maxDepth", DiscreteRange(Seq(2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20))) + paramSpace.addIntParam(rawClassifier.toString(), "numTrees", IntervalRange(3,30,1)) + paramSpace.addDoubleParam(rawClassifier.toString(), "minInfoGain", ContinueRange(0,0.1)) + paramSpace.addDoubleParam(rawClassifier.toString(), "subsamplingRate", ContinueRange(0.6,1.0)) + + val cv = new BayesianCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator() + .setLabelCol("survived") + .setRawPredictionCol("survival_raw_pred")) + .setEstimatorParamSpace(paramSpace) + .setNumIterations(500) + .setThreshold(0.856) + .setNumFolds(5) + .setSeed(seed) + + val model = cv.fit(data) + println(cv.getSearchNumber) + println(cv.getBestMetric) + data.unpersist() + val endTime = System.currentTimeMillis() + println((endTime - startTime)/1000.0) + (cv.getSearchNumber, cv.getBestMetric, (endTime - startTime)/1000.0 ) + } + + def titanicGBT(spark: SparkSession, params: BOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + var dataWithNulls = { + spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ").option("header", true).csv(trainPath) + .repartition(1) + .withColumn("survived", col("Survived").cast(DoubleType)) + .withColumn("age", col("Age").cast(DoubleType)) + .withColumn("siblings_spouses", col("SibSp").cast(DoubleType)) + .withColumn("parents_children", col("Parch").cast(DoubleType)) + .withColumn("fare", col("Fare").cast(DoubleType)) + .select(col("survived"), col("Name") as "passenger_name", col("Pclass") as "passenger_class", col("Sex") as "sex", + col("age"), col("fare"), col("siblings_spouses"), col("parents_children")) + .repartition(partitionNum) + } + val meanAge = dataWithNulls.select(mean("age")).first.getDouble(0) + val data = dataWithNulls.withColumn("age", coalesce(col("age"), lit(meanAge))).cache() + val titleTransformer = new SQLTransformer("title").setStatement( + s""" + |SELECT * + |, CASE WHEN passenger_name LIKE '%\\.%' THEN split(passenger_name, '\\\\.')[0] + | ELSE 'Nothing' + | END AS passenger_title + |FROM __THIS__ + """.stripMargin + ) + val categoricalCols = Array("passenger_class", "sex", "passenger_title") + val indexCols = 
categoricalCols.map(_ + "_index") + val oheCols = categoricalCols.map(_ + "_ohe") + val stringIndexers = categoricalCols.map(cc => { + new StringIndexer(s"string_indexer_$cc") + .setHandleInvalid("keep") + .setInputCol(cc) + .setOutputCol(cc + "_index") + }) + val oneHotEncoder = { + new OneHotEncoder("ohe") + .setHandleInvalid("keep") + .setDropLast(false) + .setInputCols(indexCols) + .setOutputCols(oheCols) + } + + val numericalCols = Array("age", "fare", "siblings_spouses", "parents_children") + val vectorAssembler = { + new VectorAssembler("vector_assembler") + .setInputCols(oheCols ++ numericalCols) + .setOutputCol("features") + } + + val rawClassifier = new GBTClassifier() + .setFeaturesCol("features") + .setLabelCol("survived") + .setProbabilityCol("survival_prob") + .setRawPredictionCol("survival_raw_pred") + + val pipeline = new Pipeline("pipeline") + .setStages(Array(titleTransformer) ++ stringIndexers ++ Array(oneHotEncoder, vectorAssembler, rawClassifier)) + val paramSpace:ParamSpace = new ParamSpace() + paramSpace.addIntParam(rawClassifier.toString(), "maxIter", IntervalRange(3,10,1)) + paramSpace.addDoubleParam(rawClassifier.toString(), "subsamplingRate", ContinueRange(0.5,1.0)) + paramSpace.addDoubleParam(rawClassifier.toString(), "minInfoGain", ContinueRange(0.0,0.5)) + paramSpace.addIntParam(rawClassifier.toString(), "maxDepth", IntervalRange(3,10,1)) + + val cv = new BayesianCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator() + .setLabelCol("survived") + .setRawPredictionCol("survival_raw_pred")) + .setEstimatorParamSpace(paramSpace) + .setNumIterations(500) + .setThreshold(0.86) + .setNumFolds(4) + .setSeed(seed) + + val model = cv.fit(data) + println(cv.getSearchNumber) + println(cv.getBestMetric) + data.unpersist() + val endTime = System.currentTimeMillis() + println((endTime - startTime)/1000.0) + (cv.getSearchNumber, cv.getBestMetric, (endTime - startTime)/1000.0 ) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/CovRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/CovRunner.scala new file mode 100644 index 0000000..265012c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/CovRunner.scala @@ -0,0 +1,128 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.MatrixVerify + +import org.apache.spark.SparkConf +import org.apache.spark.mllib.linalg.{DenseMatrix, Vectors} +import org.apache.spark.mllib.linalg.distributed.RowMatrix +import org.apache.spark.sql.SparkSession +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class CovConfig extends Serializable { + + @BeanProperty var cov: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class CovParams extends Serializable { + @BeanProperty var numPartitions: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + 
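+  // verifiedDataPath: baseline result that saveDataPath is compared against when ifCheck is "yes"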
@BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object CovRunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/cov/cov.yml") + val representer = new Representer + representer.addClassTag(classOf[CovParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[CovConfig]), representer, options) + val description = new TypeDescription(classOf[CovParams]) + yaml.addTypeDescription(description) + val config: CovConfig = yaml.load(stream).asInstanceOf[CovConfig] + val paramsMap = config.cov.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new CovParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("Cov") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val costTime = new CovKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(MatrixVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${params.getCostTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class CovKernel { + def runJob(spark: SparkSession,params: CovParams): Double = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val startTime = System.currentTimeMillis() + val data = sc.textFile(params.dataPath) + .map(x => Vectors.dense(x.split(",").map(_.toDouble))) + .repartition(params.numPartitions) + .persist(StorageLevel.MEMORY_ONLY) + val matrix = new RowMatrix(data) + val covMat = matrix.computeCovariance().asInstanceOf[DenseMatrix] + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + MatrixVerify.saveMatrix(covMat, params.saveDataPath, sc) + costTime + } +} + diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/DTBRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/DTBRunner.scala new file mode 100644 index 0000000..58161d5 --- /dev/null +++ 
b/tools/kal-test/src/main/scala/com/bigdata/ml/DTBRunner.scala @@ -0,0 +1,295 @@ +package com.bigdata.ml + +import java.io.{File, FileWriter} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.Pipeline +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import scala.beans.BeanProperty +import java.util +import java.util.Date +import com.bigdata.utils.{DTBucketUtils, Utils} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.classification.{DecisionTreeClassificationModel, DecisionTreeClassifier} +import org.apache.spark.ml.feature.{DecisionTreeBucketModel, DecisionTreeBucketizer, StringIndexer} +import org.apache.spark.ml.linalg.{SparseVector, Vector => MLVector, Vectors => MLVectors} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.sql.functions.{col, udf} +import org.apache.spark.storage.StorageLevel + +class DTBConfig extends Serializable { + @BeanProperty var dtb: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class DTBParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBins: Int = _ + @BeanProperty var useNodeIdCache: Boolean = _ + @BeanProperty var checkpointInterval: Int = _ + @BeanProperty var numCopiesInput: Int = _ + @BeanProperty var maxMemoryInMB: Int = _ + @BeanProperty var genericPt: Int = _ + @BeanProperty var featuresType: String = _ + @BeanProperty var bcVariables: Boolean = _ + @BeanProperty var saveBucketedRes: Boolean = _ + @BeanProperty var verifyBucketedRes: Boolean = _ + @BeanProperty var bucketedResPath: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var verified: String = _ +} + +object DTBRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, apiName, saveOrVerify, bucketedResPath) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + + val cpuName = args(2) + val isRaw = args(3) + val sparkConfSplit = args(4).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + + val stream = Utils.getStream("conf/ml/dtb/dtb.yml") + val representer = new Representer + representer.addClassTag(classOf[DTBParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[DTBConfig]), representer, options) + val description = new TypeDescription(classOf[DTBParams]) + yaml.addTypeDescription(description) + + val configs: DTBConfig = yaml.load(stream).asInstanceOf[DTBConfig] + val params = new DTBParams() + + val dtbParamMap: util.HashMap[String, Object] = configs.dtb.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(datasetName) + params.setGenericPt(dtbParamMap.getOrDefault("genericPt", 
"1000").asInstanceOf[Int]) + params.setMaxMemoryInMB(dtbParamMap.getOrDefault("maxMemoryInMB", "256").asInstanceOf[Int]) + params.setPt(dtbParamMap.getOrDefault("pt", "1000").asInstanceOf[Int]) + params.setNumCopiesInput(dtbParamMap.getOrDefault("numCopiesInput", "1").asInstanceOf[Int]) + params.setMaxDepth(dtbParamMap.getOrDefault("maxDepth", "5").asInstanceOf[Int]) + params.setMaxBins(dtbParamMap.getOrDefault("maxBins", "32").asInstanceOf[Int]) + params.setUseNodeIdCache(dtbParamMap.getOrDefault("useNodeIdCache", "false").asInstanceOf[Boolean]) + params.setCheckpointInterval(dtbParamMap.getOrDefault("checkpointInterval", "10").asInstanceOf[Int]) + params.setFeaturesType(dtbParamMap.getOrDefault("featuresType", "array").asInstanceOf[String]) + params.setBcVariables(dtbParamMap.getOrDefault("bcVariables", "false").asInstanceOf[Boolean]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setApiName(apiName) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setBucketedResPath(bucketedResPath) + params.setSaveBucketedRes(saveOrVerify.equals("save")) + params.setVerifyBucketedRes(saveOrVerify.equals("verify")) + + val conf = new SparkConf().setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + + var appName = s"DTB_RAW_${datasetName}_${apiName}" + if ("no" == isRaw.asInstanceOf[String]) { + appName = s"DTB_${datasetName}_${apiName}" + conf.set("spark.boostkit.ml.rf.binnedFeaturesDataType", params.featuresType) + conf.set("spark.boostkit.ml.rf.numTrainingDataCopies", params.numCopiesInput.toString) + conf.set("spark.boostkit.ml.rf.numPartsPerTrainingDataCopy", params.pt.toString) + conf.set("spark.boostkit.ml.rf.broadcastVariables", params.bcVariables.toString) + } + conf.setAppName(apiName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val sc = spark.sparkContext + println(s"[KernelEx] initialized spark session. ${new Date().toString}") + + // 检查hdfs的分箱结果文件 + val fs = FileSystem.get(sc.hadoopConfiguration) + val bucketedResFile = new Path(params.bucketedResPath) + if (!params.saveBucketedRes && !params.verifyBucketedRes) { + println("You can only choose Option save or Option verify") + System.exit(-1) + } + if (params.saveBucketedRes && fs.exists(bucketedResFile)) { + if (!fs.delete(bucketedResFile, true)) { + println(s"Bucketed result file/path(${params.bucketedResPath}) can't be deleted!") + System.exit(-1) + } else { + println(s"Bucketed result file/path(${params.bucketedResPath}) is deleted!") + } + assert(!fs.exists(bucketedResFile), s"Bucketed result file/path(${params.bucketedResPath}) is deleted!") + } + if (params.verifyBucketedRes && !fs.exists(bucketedResFile)) { + println(s"Bucketed result verification is enabled. 
But reference bucketed result file/path(${params.bucketedResPath}) is not found!") + System.exit(-1) + } + val numFeatures = datasetName.toLowerCase match { + case "mnist8m" => 784 + case "higgs" => 28 + } + val numPtTrainData = isRaw match { + case "no" => params.genericPt + case _ => params.pt + } + val startTime = System.currentTimeMillis() + val reader = spark.read.format("libsvm") + val trainingData = reader.load(trainingDataPath).repartition(numPtTrainData).persist(StorageLevel.MEMORY_AND_DISK_SER) + + val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(trainingData) + val fmtTrainingData = labelIndexer.transform(trainingData) + + val (bucketedTrainingData, numTrees, trainFinishTime) = isRaw match { + case "no" => + val dtb = new DecisionTreeBucketizer() + .setLabelCol("indexedLabel") + .setFeaturesCol("features") + .setMaxBins(params.maxBins) + .setMaxDepth(params.maxDepth) + .setCacheNodeIds(params.useNodeIdCache) + .setCheckpointInterval(params.checkpointInterval) + .setMaxMemoryInMB(params.maxMemoryInMB) + val pipeline = new Pipeline().setStages(Array(dtb)) + + val paramMap = ParamMap(dtb.maxBins -> params.maxBins) + .put(dtb.maxDepth, params.maxDepth) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size - 1) { + paramMaps(i) = ParamMap(dtb.maxBins -> params.maxBins) + .put(dtb.maxDepth, params.maxDepth) + } + val checkpointIntervalParamPair = ParamPair(dtb.checkpointInterval, params.checkpointInterval) + val models = params.apiName match { + case "fit1" => pipeline.fit(fmtTrainingData, paramMap) + case "fit2" => + val models = pipeline.fit(fmtTrainingData, paramMaps) + models(0) + case "fit3" => pipeline.fit(fmtTrainingData, checkpointIntervalParamPair) + case _ => pipeline.fit(fmtTrainingData) + } + val trainFinishTime = System.currentTimeMillis() + val bucketedTrainingData = models.transform(fmtTrainingData).rdd.map(_.toString()).cache() + bucketedTrainingData.foreachPartition(_ => {}) + val dtbModel = models.stages(0).asInstanceOf[DecisionTreeBucketModel] + println(s"totalNumNodes = ${dtbModel.totalNumNodes}") + (bucketedTrainingData, dtbModel.getNumTrees, trainFinishTime) + case _ => + val dtcModels = Array.range(0, numFeatures).map { index => + val dtc = new DecisionTreeClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("features") + .setMaxBins(params.maxBins) + .setMaxDepth(params.maxDepth) + .setCacheNodeIds(params.useNodeIdCache) + .setCheckpointInterval(params.checkpointInterval) + .setMaxMemoryInMB(params.maxMemoryInMB) + val pipeline = new Pipeline().setStages(Array(dtc)) + val curTrainingData = spark.createDataFrame(fmtTrainingData.rdd.map {row => + val featureVal = row.getAs[MLVector]("features")(index) + (row.getAs[Double]("indexedLabel"), MLVectors.dense(Array(featureVal))) + }).toDF("indexedLabel", "features") + + val paramMap = ParamMap(dtc.maxBins -> params.maxBins) + .put(dtc.maxDepth, params.maxDepth) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size - 1) { + paramMaps(i) = ParamMap(dtc.maxBins -> params.maxBins) + .put(dtc.maxDepth, params.maxDepth) + } + val checkpointIntervalParamPair = ParamPair(dtc.checkpointInterval, params.checkpointInterval) + val models = params.apiName match { + case "fit1" => pipeline.fit(curTrainingData, paramMap) + case "fit2" => + val models = pipeline.fit(curTrainingData, paramMaps) + models(0) + case "fit3" => pipeline.fit(curTrainingData, checkpointIntervalParamPair) + case _ => 
pipeline.fit(curTrainingData) + } + models.stages(0).asInstanceOf[DecisionTreeClassificationModel] + }.map(org.apache.spark.mllib.tree.helper.toOldDTModel) + val trainFinishTime = System.currentTimeMillis() + val numTrees = dtcModels.length + println(s"totalNumNodes = ${dtcModels.map(_.numNodes).sum}") + val func2 = (x: MLVector) => { + val bucketedValues = Range(0, numTrees).map {index => + val model = dtcModels(index) + val treeLeafArray: Array[Int] = DTBucketUtils.getLeafNodes(model.topNode) + val treePredict = DTBucketUtils.predictModify(model.topNode, new SparseVector(1, Array(0), Array(x(index)))) + val bucket_num = treeLeafArray.indexOf(treePredict) + bucket_num.toDouble + }.toArray + MLVectors.dense(bucketedValues) + } + val udfDiscretizerUDF = udf(func2) + val bucketedTrainingData = fmtTrainingData.withColumn(s"bucketedFeatures", udfDiscretizerUDF(col("features"))) + .rdd.map(_.toString()).cache() + bucketedTrainingData.foreachPartition(_ => {}) + (bucketedTrainingData, numTrees, trainFinishTime) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000 + params.setCostTime(costTime) + bucketedTrainingData.take(2).foreach(println) + var verifiedResult = if (params.verifyBucketedRes) "mismatch" else "unknown" + if (params.verifyBucketedRes) { + val output = bucketedTrainingData.repartition(params.genericPt) + val refRes = spark.sparkContext.textFile(params.bucketedResPath).repartition(params.genericPt) + val dataDiff1Cnt = output.subtract(refRes).count() + val dataDiff2Cnt = refRes.subtract(output).count() + if (dataDiff1Cnt != 0 || dataDiff2Cnt != 0) { + System.err.println(s"[ERROR] diff1Cnt: ${dataDiff1Cnt}, diff2Cnt: ${dataDiff2Cnt}") + System.err.println("output data is mismatch!") + verifiedResult = "mismatch" + } else { + println("output data is verified!") + verifiedResult = "verified" + } + } + params.setVerified(verifiedResult) + if (params.saveBucketedRes) { + bucketedTrainingData.repartition(100).saveAsTextFile(params.bucketedResPath) + println("bucketed result saved successful!") + } + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter(s"report/DTB_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; verify: ${params.verified}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/DTRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/DTRunner.scala new file mode 100644 index 0000000..b633dd9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/DTRunner.scala @@ -0,0 +1,422 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.DecisionTreeClassifier +import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.ml.feature.StringIndexer +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.regression.DecisionTreeRegressor +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.tree.DecisionTree +import 
org.apache.spark.mllib.util.MLUtils +import org.apache.spark.storage.StorageLevel +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import scala.beans.BeanProperty +import java.util +import java.io.{File, FileWriter} + +class DTConfig extends Serializable { + @BeanProperty var dt: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]]] = _ +} + +class DTParams extends Serializable { + @BeanProperty var genericPt: Int = _ + @BeanProperty var maxMemoryInMB: Int = _ + @BeanProperty var pt: Int = _ + @BeanProperty var numCopiesInput: Int = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBins: Int = _ + @BeanProperty var numClasses: Int = _ + @BeanProperty var useNodeIdCache: Boolean = _ + @BeanProperty var checkpointInterval: Int = _ + @BeanProperty var featuresType: String = _ + @BeanProperty var bcVariables: Boolean = _ + @BeanProperty var copyStrategy: String = _ + @BeanProperty var useDFCollPtner: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object DTRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4), modelConfSplit(5)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/dt/dt.yml") + val representer = new Representer + representer.addClassTag(classOf[DTParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[DTConfig]), representer, options) + val description = new TypeDescription(classOf[DTParams]) + yaml.addTypeDescription(description) + val configs: DTConfig = yaml.load(stream).asInstanceOf[DTConfig] + val params = new DTParams() + + val ParamMap: util.HashMap[String, Object] = configs.dt.get(isRaw match { + case "no" => "opt" + case _ => "raw" + }).get(algorithmType).get(dataStructure).get(datasetName) + params.setGenericPt(ParamMap.getOrDefault("genericPt", "1000").asInstanceOf[Int]) + params.setMaxMemoryInMB(ParamMap.getOrDefault("maxMemoryInMB", "256").asInstanceOf[Int]) + params.setPt(ParamMap.getOrDefault("pt", "1000").asInstanceOf[Int]) + params.setNumCopiesInput(ParamMap.getOrDefault("numCopiesInput", "1").asInstanceOf[Int]) + 
params.setMaxDepth(ParamMap.getOrDefault("maxDepth", "5").asInstanceOf[Int]) + params.setMaxBins(ParamMap.getOrDefault("maxBins", "32").asInstanceOf[Int]) + params.setNumClasses(ParamMap.get("numClasses").asInstanceOf[Int]) + params.setUseNodeIdCache(ParamMap.getOrDefault("useNodeIdCache", "false").asInstanceOf[Boolean]) + params.setCheckpointInterval(ParamMap.getOrDefault("checkpointInterval", "10").asInstanceOf[Int]) + params.setFeaturesType(ParamMap.getOrDefault("featuresType", "array").asInstanceOf[String]) + params.setBcVariables(ParamMap.getOrDefault("bcVariables", "false").asInstanceOf[Boolean]) + params.setCopyStrategy(ParamMap.getOrDefault("copyStrategy", "normal").asInstanceOf[String]) + params.setUseDFCollPtner(ParamMap.getOrDefault("useDFCollPtner", "true").asInstanceOf[String]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setAlgorithmType(algorithmType) + params.setApiName(apiName) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("DT") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}_${dataStructure}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${dataStructure}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${dataStructure}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + if ("no" == isRaw.asInstanceOf[String]) { + conf.set("spark.boostkit.ml.rf.binnedFeaturesDataType", + ParamMap.get("featuresType").asInstanceOf[String]) + conf.set("spark.boostkit.ml.rf.numTrainingDataCopies", + ParamMap.get("numCopiesInput").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.rf.numPartsPerTrainingDataCopy", + ParamMap.get("pt").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.rf.broadcastVariables", + ParamMap.get("bcVariables").asInstanceOf[Boolean].toString) + conf.set("spark.boostkit.ml.rf.copyStrategy", + ParamMap.get("copyStrategy").asInstanceOf[String]) + conf.set("spark.boostkit.ml.rf.useDFCollPartitioner", + ParamMap.get("useDFCollPtner").asInstanceOf[String]) + if (dataStructure == "rdd") { + conf.set("spark.boostkit.ml.rf.maxBins", + ParamMap.get("maxBins").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.rf.maxMemoryInMB", + ParamMap.get("maxMemoryInMB").asInstanceOf[Int].toString) + } + } + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = dataStructure match { + case "dataframe" => new DTKernel().dtDataframeJob(spark, params) + case "rdd" => new DTKernel().dtRDDJob(spark, params) + } + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + 
params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class DTKernel { + def dtDataframeJob(spark: SparkSession, params: DTParams): (Double, Double) = { + val sc = spark.sparkContext + val pt = params.pt + val trainingDataPath = params.trainingDataPath + val testDataPath = params.testDataPath + val maxDepth = params.maxDepth + val maxBins = params.maxBins + val useNodeIdCache = params.useNodeIdCache + val checkpointInterval = params.checkpointInterval + val maxMemoryInMB = params.maxMemoryInMB + val genericPt = params.genericPt + + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + + val indexLabel: Boolean = params.datasetName match { + case "mnist8m" => + false + case "higgs" => + false + case "epsilon" => + true + case "rcv" => + true + case _ => + true + } + var indexLabelDone = false + val trainingLabelColName = if (indexLabel) "indexedLabel" else "label" + + val reader = spark.read.format("libsvm") + if (params.datasetName == "mnist8m") { + reader.option("numFeatures",784) + } else if (params.datasetName == "higgs") { + reader.option("numFeatures",28) + } else if (params.datasetName == "epsilon") { + reader.option("numFeatures", 2000) + } else if (params.datasetName == "rcv") { + reader.option("numFeatures", 47236) + } + + val numPtTrainData = if ("no" == params.isRaw) genericPt else pt + println(s"numPtTrainData = ${(numPtTrainData)}") + val trainingData = { + var trainingData = reader + .load(trainingDataPath) + .repartition(numPtTrainData) + if (indexLabel && params.algorithmType == "classification") { + params.datasetName match { + case "epsilon" | "rcv" => + trainingData = trainingData.selectExpr("if(label < 0.0, 0, 1) as indexedLabel", "features") + indexLabelDone = true + case _ => + println(s"index Label by StringIndexer because of unknown dataset") + } + } + trainingData.persist(StorageLevel.MEMORY_AND_DISK_SER) + trainingData + } + + //for implementing different fit APIs + val maxBinsJY = 10 + val maxDepthJY = 3 + + // Train a RandomForest model + val dTree = params.algorithmType match { + case "classification" =>{ + val oldDt = new DecisionTreeClassifier() + .setLabelCol(trainingLabelColName) + .setFeaturesCol("features") + .setMaxDepth(maxDepthJY) + .setMaxBins(maxBinsJY) + .setCacheNodeIds(useNodeIdCache) + .setCheckpointInterval(checkpointInterval) + .setMaxMemoryInMB(maxMemoryInMB) + if (params.apiName == "fit"){ + oldDt.setMaxBins(maxBins) + oldDt.setMaxDepth(maxDepth) + } + oldDt + } + case "regression" =>{ + val oldDt = new DecisionTreeRegressor() + .setLabelCol("label") + .setFeaturesCol("features") + .setMaxDepth(maxDepthJY) + .setMaxBins(maxBinsJY) + .setCacheNodeIds(useNodeIdCache) + .setCheckpointInterval(checkpointInterval) + .setMaxMemoryInMB(maxMemoryInMB) + if (params.apiName == "fit"){ + oldDt.setMaxBins(maxBins) + oldDt.setMaxDepth(maxDepth) + } + oldDt + } + } + + val pipeline = if (!indexLabelDone && params.algorithmType == "classification") { + val labelIndexer = new StringIndexer() + 
.setInputCol("label") + .setOutputCol("indexedLabel") + .fit(trainingData) + new Pipeline() + .setStages(Array(labelIndexer, dTree)) + } else { + new Pipeline() + .setStages(Array(dTree)) + } + + val paramMap = ParamMap(dTree.maxDepth ->params. maxDepth) + .put(dTree.maxBins, params.maxBins) + + val paramMaps = new Array[ParamMap](2) + for (i <- 0 until paramMaps.size){ + paramMaps(i) = ParamMap(dTree.maxDepth -> params.maxDepth) + .put(dTree.maxBins, params.maxBins) + } + + val maxDepthParamPair = ParamPair(dTree.maxDepth, params.maxDepth) + val maxBinsParamPair = ParamPair(dTree.maxBins, params.maxBins) + + val model = params.apiName match { + case "fit" => pipeline.fit(trainingData) + case "fit1" => pipeline.fit(trainingData, paramMap) + case "fit2" => + val models = pipeline.fit(trainingData, paramMaps) + models(0) + case "fit3" => pipeline.fit(trainingData, maxDepthParamPair, maxBinsParamPair) + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = { + var testData = reader + .load(testDataPath) + .repartition(genericPt) + if (indexLabel && params.algorithmType == "classification") { + params.datasetName match { + case "epsilon" | "rcv" => + testData = testData.selectExpr("if(label < 0.0, 0, 1) as indexedLabel", "features") + case _ => + println(s"index Label by StringIndexer because of unknown dataset") + } + } + testData.persist(StorageLevel.MEMORY_AND_DISK_SER) + testData + } + + // Make predictions. + val predictions = model.transform(testData) + + // Select (prediction, true label) and compute test error. + val evaluator = params.algorithmType match { + case "classification" => + new MulticlassClassificationEvaluator() + .setLabelCol (trainingLabelColName) + .setPredictionCol ("prediction") + .setMetricName ("accuracy") + + case "regression" => + new RegressionEvaluator() + .setLabelCol ("label") + .setPredictionCol ("prediction") + .setMetricName ("rmse") + } + val res = evaluator.evaluate(predictions) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def dtRDDJob(spark: SparkSession, params: DTParams): (Double, Double) = { + + val pt = params.pt + val trainingDataPath = params.trainingDataPath + val testDataPath = params.testDataPath + var maxDepth = params.maxDepth + val maxBins = params.maxBins + val genericPt = params.genericPt + var numClasses = params.numClasses + + val sc = spark.sparkContext + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + + val numFeatures = params.datasetName match { + case "mnist8m" => 784 + case "higgs" => 28 + case "epsilon" =>2000 + case "rcv" => 47236 + } + + val numPtTrainData = if ("no" == params.isRaw) genericPt else pt + val trainingData = MLUtils.loadLibSVMFile(sc, trainingDataPath, numFeatures) + .repartition(numPtTrainData) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val trainingLabelPositive = trainingData.map(i=> if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint (i.label, i.features) + }) + + val model = params.algorithmType match { + case "classification" => + DecisionTree.trainClassifier(trainingLabelPositive, numClasses, Map.empty[Int, Int], "gini", maxDepth, maxBins) + case "regression" => + DecisionTree.trainRegressor(trainingLabelPositive, Map.empty[Int, Int], "variance", maxDepth, maxBins) + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = MLUtils.loadLibSVMFile(sc, testDataPath, numFeatures) + .repartition(genericPt) + 
.persist(StorageLevel.MEMORY_AND_DISK_SER) + val testLabelPositive = testData.map(i=> if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint (i.label, i.features) + }) + + val labeleAndPreds = testLabelPositive.map{ point => + val prediction = model.predict(point.features) + (point.label, prediction) + } + val res = params.algorithmType match { + case "classification" => 1.0 - labeleAndPreds.filter(r => r._1 == r._2).count.toDouble / testLabelPositive.count() + case "regression" => math.sqrt(labeleAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.mean()) + } + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/EncoderRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/EncoderRunner.scala new file mode 100644 index 0000000..62b2ba4 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/EncoderRunner.scala @@ -0,0 +1,162 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.EncoderVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.feature.FeatureEncoding +import org.apache.spark.ml.feature.FeatureEncodingOrigin + +import java.io.FileWriter +import java.util +import java.util.Date +import scala.beans.BeanProperty + + +class EncoderConfig extends Serializable { + @BeanProperty var encoder: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class EncoderParams extends Serializable { + @BeanProperty var encodeColumns: String = _ + @BeanProperty var numThread: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var mapLoadPath: String = _ + @BeanProperty var localSavePath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object EncoderRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val pathSplit = args(1).split(",") + val (dataPath, mapLoadPath, localSavePath) = + (pathSplit(0), pathSplit(1), pathSplit(2)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/encoder/encoder.yml") + val representer = new Representer + representer.addClassTag(classOf[EncoderParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[EncoderConfig]), representer, options) + val description = new TypeDescription(classOf[EncoderParams]) + yaml.addTypeDescription(description) + val configs: EncoderConfig = yaml.load(stream).asInstanceOf[EncoderConfig] + val paramsMap: util.HashMap[String, Object] = configs.encoder.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new EncoderParams() 
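+ // Populate EncoderParams from the selected ("opt" or "raw") block of conf/ml/encoder/encoder.yml and the command-line arguments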
+ params.setEncodeColumns(paramsMap.get("encodeColumns").asInstanceOf[String]) + params.setNumThread(paramsMap.get("numThread").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setMapLoadPath(mapLoadPath) + params.setLocalSavePath(localSavePath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("Encoder") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + val costTime = new EncoderKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(EncoderVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class EncoderKernel { + def runJob(spark: SparkSession, params: EncoderParams): Double = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + + import spark.implicits._ + val df = spark.sparkContext.textFile(params.dataPath).map{ + t => + val row = t.split(",") + (row(0), row(1), row(2), row(3), row(4), row(5), row(6), row(7), row(8), row(9), + row(10), row(11), row(12), row(13), row(14)) + }.toDF("1xxx","2xxx","3xxx","4xxx","5xxx","6xxx","7xxx","8xxx","9xxx","10xxx","11xxx","12xxx","13xxx","14xxx","15xxx") + .repartition(800) + .cache() + + val encoder = params.isRaw match { + case "yes" => { + val fe = new FeatureEncodingOrigin() + .setMapLoadPath(params.mapLoadPath) + .setDataPath(params.dataPath) + .setOutputFilePath(params.saveDataPath) + .setLocalSavePath(params.localSavePath) + .setEncodeColumns(params.encodeColumns) + .setNumThread(params.numThread) + fe.execute(df) + fe + } + case "no" => { + val fe = new FeatureEncoding() + .setMapLoadPath(params.mapLoadPath) + .setDataPath(params.dataPath) + .setOutputFilePath(params.saveDataPath) + .setLocalSavePath(params.localSavePath) + .setEncodeColumns(params.encodeColumns) + .setNumThread(params.numThread) + fe.execute(df) + fe + } + } + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + costTime + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/FMRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/FMRunner.scala new file mode 100644 index 0000000..8feec82 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/FMRunner.scala @@ -0,0 +1,252 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import 
com.bigdata.compare.ml.DownEvaluationVerify + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.{udf, when} +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.FMClassifier +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.regression.FMRegressor +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter, PrintWriter} +import java.nio.file.{Paths, Files} +import java.util +import scala.beans.BeanProperty +import scala.io.Source + +class FMConfig extends Serializable { + @BeanProperty var fm: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class FMParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var numFeatures: Int = _ + @BeanProperty var sparseOrDense: String = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var numIterations: Int = _ + @BeanProperty var tolerance: Double = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object FMRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/fm/fm.yml") + val representer = new Representer + representer.addClassTag(classOf[FMParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[FMConfig]), representer, options) + val description = new TypeDescription(classOf[FMParams]) + yaml.addTypeDescription(description) + val configs: FMConfig = yaml.load(stream).asInstanceOf[FMConfig] + val params = new FMParams() + val paramsMap: util.HashMap[String, Object] = configs.fm.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(algorithmType).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "276").asInstanceOf[Int]) + params.setNumFeatures(paramsMap.getOrDefault("numFeatures", "28").asInstanceOf[Int]) + params.setSparseOrDense(paramsMap.getOrDefault("sparseOrDense", "dense").asInstanceOf[String]) + 
params.setRegParam(paramsMap.getOrDefault("regParam", "0.0").asInstanceOf[Double]) + params.setNumIterations(paramsMap.getOrDefault("numIterations", "5000").asInstanceOf[Int]) + params.setTolerance(paramsMap.getOrDefault("tolerance", "1E-6").asInstanceOf[Double]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setAlgorithmType(algorithmType) + params.setApiName(apiName) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("FM") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + if (isRaw.equals("no") && (datasetName.equals("higgs") || datasetName.equals("epsilon"))) { + conf.set("spark.boostkit.mllib.optimization.LBFGSN.costFun.opt", "false") + } + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = new FMKernel().fmDataframeJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class FMKernel { + def fmDataframeJob(spark: SparkSession, params: FMParams): (Double, Double) = { + val sc = spark.sparkContext + val pt = params.pt + val algorithmType = params.algorithmType + val trainingDataPath = params.trainingDataPath + val testDataPath = params.testDataPath + val numFeatures = params.numFeatures + val sparseOrDense = params.sparseOrDense + val regParam = params.regParam + val numIterations = params.numIterations + val tolerance = params.tolerance + + println(s"Initialized spark session.") + val t1 = System.currentTimeMillis() + + import spark.implicits._ + val trainData = spark.read.format("libsvm").option("vectorType", sparseOrDense).option("numFeatures", numFeatures) + .load(trainingDataPath).withColumn("label", when($"label" === -1.0, 0.0).otherwise($"label")) + .repartition(pt).cache() + println("trainData: " + trainData.count()) + val t2 = 
System.currentTimeMillis() + println("* after preprocess: " + t2) + + val fm = algorithmType match { + case "classification" =>{ + val modelFM = new FMClassifier() + .setRegParam(regParam) + .setMaxIter(numIterations) + .setTol(tolerance) + .setSeed(-2050267832) + modelFM + } + case "regression" =>{ + val modelFM = new FMRegressor() + .setRegParam(regParam) + .setMaxIter(numIterations) + .setTol(tolerance) + .setSeed(-2050267832) + modelFM + } + } + val model = fm.fit(trainData) + val t3 = System.currentTimeMillis() + println("* after train: " + t3) + + val testData = spark.read.format("libsvm").option("vectorType", sparseOrDense).option("numFeatures", numFeatures) + .load(testDataPath).withColumn("label", when($"label" === -1.0, 0.0).otherwise($"label")) + .repartition(pt).cache() + println("testData: " + testData.count()) + + val getSquaredError = udf((v1: Double, v2: Double) => { + math.pow((v1 - v2), 2) + }) + // Evaluate model on training examples and compute training error + val (res, t4) = algorithmType match { + case "classification" =>{ + val valuesAndPreds = model.transform(testData) + val accuracy = valuesAndPreds.filter($"label" === $"prediction").count().toDouble / valuesAndPreds.count + val t4 = System.currentTimeMillis() + println("* after predict: " + t4) + + println("\n--------success--------\n") + + val valuesAndPreds1 = model.transform(trainData) + val accuracy1 = valuesAndPreds1.filter($"label" === $"prediction").count().toDouble / valuesAndPreds1.count + println("trainData accuracy = " + accuracy1) + println("testData accuracy = " + accuracy) + (accuracy, t4) + } + case "regression" =>{ + val valuesAndPreds = model.transform(testData) + .withColumn("squaredError", getSquaredError($"label", $"prediction")) + .select("squaredError").summary("mean") + val rmse = math.sqrt(valuesAndPreds.select("squaredError").first().getString(0).toDouble) + val t4 = System.currentTimeMillis() + println("* after predict: " + t4) + println("\n--------success--------\n") + val valuesAndPreds1 = model.transform(trainData) + .withColumn("squaredError", getSquaredError($"label", $"prediction")) + .select("squaredError").summary("mean") + val rmse1 = math.sqrt(valuesAndPreds1.select("squaredError").first().getString(0).toDouble) + println("trainData rmse = " + rmse1) + println("testData rmse = " + rmse) + (rmse, t4) + } + } + + val trainingProcess = (t3 - t1).toDouble / 1000 + val trainingStep = (t3 - t2).toDouble / 1000 + val dataProcess = (t2 - t1).toDouble / 1000 + val predict = (t4 - t3).toDouble / 1000 + println("[s]train total: " + trainingProcess) + println("[s]data preprocess: " + dataProcess) + println("[s]train: " + trainingStep) + println("[s]predict: " + predict) + + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, trainingProcess) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/FPGRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/FPGRunner.scala new file mode 100644 index 0000000..5ee5b26 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/FPGRunner.scala @@ -0,0 +1,184 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.FPGVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.internal.Logging +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.Row +import 
org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.fpm.FPGrowth + +import java.io.FileWriter +import java.util +import java.util.Date +import scala.beans.BeanProperty + +class FPGConfig extends Serializable { + @BeanProperty var fpg: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class FPGParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var itemsCol: String = _ + @BeanProperty var minSupport: Double = _ + @BeanProperty var minConfidence: Double = _ + @BeanProperty var optLevel: Int = _ + @BeanProperty var timeLimit1: String = _ + @BeanProperty var timeLimit2: String = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object FPGRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/fpg/fpg.yml") + val representer = new Representer + representer.addClassTag(classOf[FPGParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[FPGConfig]), representer, options) + val description = new TypeDescription(classOf[FPGParams]) + yaml.addTypeDescription(description) + val configs: FPGConfig = yaml.load(stream).asInstanceOf[FPGConfig] + val params = new FPGParams() + val paramsMap: util.HashMap[String, Object] = configs.fpg.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "276").asInstanceOf[Int]) + params.setItemsCol(paramsMap.getOrDefault("itemsCol", "items").asInstanceOf[String]) + params.setMinSupport(paramsMap.getOrDefault("minSupport", "0.1").asInstanceOf[Double]) + params.setMinConfidence(paramsMap.getOrDefault("minConfidence", "0.8").asInstanceOf[Double]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("FPG") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + if (isRaw.equals("no")) { + conf.set("spark.boostkit.ml.fpgrowth.optLevel", + paramsMap.get("optLevel").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.fpgrowth.timeLimit1", + paramsMap.get("timeLimit1").asInstanceOf[String]) + conf.set("spark.boostkit.ml.fpgrowth.timeLimit2", + paramsMap.get("timeLimit2").asInstanceOf[String]) + } + val spark = SparkSession.builder.config(conf).getOrCreate() + val costTime = new 
FPGKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(FPGVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class FPGKernel { + def runJob(spark: SparkSession, params: FPGParams): Double = { + import spark.implicits._ + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val dataPath = params.dataPath + val pt = params.pt + val itemsCol = params.itemsCol + val minSupport = params.minSupport + val minConfidence = params.minConfidence + + val t1 = System.currentTimeMillis() + println("\n--------start--------\n") + println("* start: " + t1) + + // Load and parse the data + val oriSequences = sc.textFile(dataPath).map { line => + var items = Set.empty[Int] + line.split(" ").foreach {itemStr => + val item = itemStr.toInt + if (item >= 0) { + items += item + } + } + items + } + val trainData = sc.parallelize(oriSequences.collect(), pt).cache() + println("trainData: " + trainData.count()) + val t2 = System.currentTimeMillis() + println("* after preprocess: " + t2) + + val fpgrowth = new FPGrowth() + .setItemsCol(itemsCol) + .setMinSupport(minSupport) + .setMinConfidence(minConfidence) + + val sequence = trainData.toDF(itemsCol) + + val model = fpgrowth.fit(sequence) + model.freqItemsets.show() + model.associationRules.show() + val t3 = System.currentTimeMillis() + println("* after train: " + t3) + println("\n--------success--------\n") + val cnt = model.freqItemsets.count() + print(s"freqItemsets count = $cnt\n\n") + + val totalTaining = (t3 - t1).toDouble / 1000 + val coreTraining = (t3 - t2).toDouble / 1000 + val dataLoading = (t2 - t1).toDouble / 1000 + println("[s]end2end train: " + totalTaining) + println("[s]data preprocess: " + dataLoading) + println("[s]core train: " + coreTraining) + val df = model.freqItemsets.map(row => row.toString().sorted) + FPGVerify.saveRes(df, params.saveDataPath, sc) + totalTaining + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/GBDTRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/GBDTRunner.scala new file mode 100644 index 0000000..b58b900 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/GBDTRunner.scala @@ -0,0 +1,326 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.Pipeline +import org.apache.spark.ml.classification.GBTClassifier +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer} +import 
org.apache.spark.ml.regression.GBTRegressor +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.evaluation.RegressionEvaluator +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.tree.configuration.BoostingStrategy +import org.apache.spark.mllib.tree.GradientBoostedTrees +import org.apache.hadoop.fs.{FileSystem, Path} + +import java.io.FileWriter +import java.io.File +import java.util.HashMap +import java.util +import scala.beans.BeanProperty + +class GBDTConfig extends Serializable { + + @BeanProperty var gbdt: util.HashMap[String, util.HashMap[String, Object]] = _ +} + +class GBDTParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBins: Int = _ + @BeanProperty var stepSize: Double = _ + @BeanProperty var cacheNodeIds: Boolean = _ + @BeanProperty var maxMemoryInMB: Int = _ + @BeanProperty var minInstancesPerNode: Int = _ + @BeanProperty var minInfoGain: Double = _ + @BeanProperty var subsamplingRate: Double = _ + @BeanProperty var featureSubsetStrategy: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + + +object GBDTRunner { + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4), modelConfSplit(5)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val saveResultPath = args(2) + + val stream = Utils.getStream("conf/ml/gbdt/gbdt.yml") + val representer = new Representer + representer.addClassTag(classOf[GBDTParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[GBDTConfig]), representer, options) + val description = new TypeDescription(classOf[GBDTParams]) + yaml.addTypeDescription(description) + val configs: GBDTConfig = yaml.load(stream).asInstanceOf[GBDTConfig] + val paramsMap = configs.gbdt.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName).asInstanceOf[HashMap[String, Object]] + val params = new GBDTParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setMaxDepth(paramsMap.get("maxDepth").asInstanceOf[Int]) + params.setMaxBins(paramsMap.get("maxBins").asInstanceOf[Int]) + params.setStepSize(paramsMap.get("stepSize").asInstanceOf[Double]) + params.setCacheNodeIds(paramsMap.get("cacheNodeIds").asInstanceOf[Boolean]) + params.setMaxMemoryInMB(paramsMap.get("maxMemoryInMB").asInstanceOf[Int]) + 
params.setMinInstancesPerNode(paramsMap.get("minInstancesPerNode").asInstanceOf[Int]) + params.setMinInfoGain(paramsMap.get("minInfoGain").asInstanceOf[Double]) + params.setSubsamplingRate(paramsMap.get("subsamplingRate").asInstanceOf[Double]) + params.setFeatureSubsetStrategy(paramsMap.get("featureSubsetStrategy").asInstanceOf[String]) + params.setAlgorithmType(algorithmType) + params.setApiName(apiName) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setDatasetName(datasetName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("GBDT") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${dataStructure}_${datasetName}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${dataStructure}_${datasetName}_${apiName}" + if (isRaw == "yes") { + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + appName = s"${params.algorithmName}_${algorithmType}_${dataStructure}_${datasetName}_${apiName}_RAW" + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = dataStructure match { + case "dataframe" => new GBDTKernel().runDataframeJob(spark, params) + case "rdd" => new GBDTKernel().runRDDJob(spark, params) + } + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + + +class GBDTKernel { + + def runDataframeJob(spark: SparkSession, params: GBDTParams): (Double, Double) = { + val sc = spark.sparkContext + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + + val trainingData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .load(params.trainingDataPath) + .repartition(params.numPartitions) + + val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(trainingData) + + val featureIndexer = new VectorIndexer() + .setInputCol("features") + .setOutputCol("indexedFeatures") + .setHandleInvalid("skip") + .setMaxCategories(2) + .fit(trainingData) + + val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + + val gbdt = params.algorithmType match { + case "classification" =>{ + new GBTClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(params.maxIter) + .setMaxDepth(params.maxDepth) + .setMaxBins(params.maxBins) + 
.setStepSize(params.stepSize) + .setMinInstancesPerNode(params.minInstancesPerNode) + .setMinInfoGain(params.minInfoGain) + .setMaxMemoryInMB(params.maxMemoryInMB) + .setSubsamplingRate(params.subsamplingRate) + .setCacheNodeIds(params.cacheNodeIds) + .setFeatureSubsetStrategy(params.featureSubsetStrategy) + .setSeed(2020) + } + case "regression" =>{ + new GBTRegressor() + .setLabelCol("indexedLabel") + .setFeaturesCol("indexedFeatures") + .setMaxIter(params.maxIter) + .setMaxDepth(params.maxDepth) + .setMaxBins(params.maxBins) + .setStepSize(params.stepSize) + .setMinInstancesPerNode(params.minInstancesPerNode) + .setMinInfoGain(params.minInfoGain) + .setMaxMemoryInMB(params.maxMemoryInMB) + .setSubsamplingRate(params.subsamplingRate) + .setCacheNodeIds(params.cacheNodeIds) + .setFeatureSubsetStrategy(params.featureSubsetStrategy) + .setSeed(2020) + } + } + + val pipeline = new Pipeline() + .setStages(Array(labelIndexer, featureIndexer, gbdt, labelConverter)) + + val paramMap = ParamMap(gbdt.maxDepth -> params.maxDepth) + .put(gbdt.maxIter, params.maxIter) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(gbdt.maxDepth -> params.maxDepth) + .put(gbdt.maxIter, params.maxIter) + } + val maxDepthParamPair = ParamPair(gbdt.maxDepth, params.maxDepth) + val maxIterParamPair = ParamPair(gbdt.maxIter, params.maxIter) + val maxBinsParamPair = ParamPair(gbdt.maxBins, params.maxBins) + + val model = params.apiName match { + case "fit" => pipeline.fit(trainingData) + case "fit1" => pipeline.fit(trainingData, paramMap) + case "fit2" => + val models = pipeline.fit(trainingData, paramMaps) + models(0) + case "fit3" => pipeline.fit(trainingData, maxDepthParamPair, maxIterParamPair, maxBinsParamPair) + + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .load(params.testDataPath) + .repartition(params.numPartitions) + // Make predictions. + val predictions = model.transform(testData) + // Select (prediction, true label) and compute test error. 
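+ // Evaluation step: accuracy (MulticlassClassificationEvaluator) is used for classification and RMSE (RegressionEvaluator) for regression; the score is persisted via Utils.saveEvaluation so that main() can later compare it against the raw baseline through UpEvaluationVerify / DownEvaluationVerify.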
+ val evaluator = params.algorithmType match { + case "classification" => + new MulticlassClassificationEvaluator() + .setLabelCol ("indexedLabel") + .setPredictionCol ("prediction") + .setMetricName ("accuracy") + case "regression" => + new RegressionEvaluator() + .setLabelCol ("indexedLabel") + .setPredictionCol ("prediction") + .setMetricName ("rmse") + } + val res = evaluator.evaluate(predictions) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def runRDDJob(spark: SparkSession, params: GBDTParams): (Double, Double) = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val trainingData = MLUtils.loadLibSVMFile(sc, params.trainingDataPath).repartition(params.numPartitions) + val trainingLabelPositive = trainingData.map(i => if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint(i.label, i.features) + }) + + val boostingStrategy = params.algorithmType match { + case "classification" => BoostingStrategy.defaultParams("Classification") + case "regression" => BoostingStrategy.defaultParams("Regression") + } + + boostingStrategy.numIterations = params.maxIter + boostingStrategy.learningRate = params.stepSize + boostingStrategy.treeStrategy.maxDepth = params.maxDepth + boostingStrategy.treeStrategy.maxBins = params.maxBins + boostingStrategy.treeStrategy.minInstancesPerNode = params.minInstancesPerNode + boostingStrategy.treeStrategy.maxMemoryInMB = params.maxMemoryInMB + boostingStrategy.treeStrategy.subsamplingRate = params.subsamplingRate + boostingStrategy.treeStrategy.useNodeIdCache = params.cacheNodeIds + + val gbdt = new GradientBoostedTrees(boostingStrategy) + val model = params.apiName match { + case "rdd" => gbdt.run(trainingLabelPositive) + case "javardd" => gbdt.run(trainingLabelPositive.toJavaRDD) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = MLUtils.loadLibSVMFile(sc, params.testDataPath).repartition(params.numPartitions) + val testLabelPositive = testData.map(i => if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint(i.label, i.features) + }) + val labelAndPreds = testLabelPositive.map{ point => + val prediction = model.predict(point.features) + (point.label, prediction) + } + val res = params.algorithmType match { + case "classification" => labelAndPreds.filter(r => r._1 == r._2).count.toDouble / testLabelPositive.count() + case "regression" => math.sqrt(labelAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.mean()) + } + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/HDBRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/HDBRunner.scala new file mode 100644 index 0000000..dadd8f9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/HDBRunner.scala @@ -0,0 +1,170 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.clustering.Hdbscan +import org.apache.spark.ml.evaluation.ClusteringEvaluator +import org.apache.spark.ml.linalg.{Vector, Vectors} + +import java.io.{File, FileWriter, PrintWriter} +import java.util +import java.util.Date 
+import scala.beans.BeanProperty +import scala.collection.mutable.ArrayBuffer + +class HDBConfig extends Serializable { + @BeanProperty var hdb: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class HDBParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var mstPartitionNum: Int = _ + @BeanProperty var seed: Int = _ + @BeanProperty var saurfangThreshold: Double = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var standSilhouette: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object HDBRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/hdb/hdb.yml") + val representer = new Representer + representer.addClassTag(classOf[HDBParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[HDBConfig]), representer, options) + val description = new TypeDescription(classOf[HDBParams]) + yaml.addTypeDescription(description) + val configs: HDBConfig = yaml.load(stream).asInstanceOf[HDBConfig] + val paramsMap: util.HashMap[String, Object] = configs.hdb.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new HDBParams() + params.setPt(paramsMap.get("pt").asInstanceOf[Int]) + params.setMstPartitionNum(paramsMap.get("mstPartitionNum").asInstanceOf[Int]) + params.setSeed(paramsMap.get("seed").asInstanceOf[Int]) + params.setSaurfangThreshold(paramsMap.get("saurfangThreshold").asInstanceOf[Double]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("HDB") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + val (res, costTime) = new HDBKernel().runJob(spark, params) + params.setStandSilhouette(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + 
Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}min; standSilhouette: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class HDBKernel { + def runJob(spark: SparkSession, params: HDBParams): (Double, Double) = { + val sc = spark.sparkContext + sc.setLogLevel("ERROR") + + val startTime = System.currentTimeMillis() + + println("\n--------start--------\n") + + // val dataRDD = sc.textFile(dataPath).map + // { + // t=>t.split(",").map{t=>t.toDouble} + // }.repartition(pt).persist() + + import scala.io.{BufferedSource, Source} + val data = new ArrayBuffer[Vector]() + val source2: BufferedSource = Source.fromFile(params.dataPath) + for (line <- source2.getLines()){ // source2.getLines() reads every line of the input file + data.append(Vectors.dense(line.split(",").map{_.toDouble})) + } + + val d1 = data.toArray + val dataRDD = sc.parallelize(d1).repartition(params.pt).cache() + + println("count: "+ dataRDD.count()) + println("dim: " + dataRDD.first().size) + val t1 = System.currentTimeMillis() + println("map Cost[min]: " + (t1 - startTime).toDouble/60/1000) + + val hdb = new Hdbscan() + .setMstPartitionNum(params.mstPartitionNum) + .setSaurfangThreshold(params.saurfangThreshold) + .setRandomSeed(params.seed) + val labels = hdb.fit(dataRDD) + val t2 = System.currentTimeMillis() + println("train Cost[min]: " + (t2 - t1).toDouble/60/1000) + println("total Cost[min]: " + (t2 - startTime).toDouble/60/1000) + + import spark.implicits._ + val valid = labels.map{t => (t._2, t._3)}.toDF("features", "prediction") + val evaluator = new ClusteringEvaluator() + val silhouette = evaluator.evaluate(valid) + val standSilhouette = (silhouette + 1) / 2.0 + println(s"Silhouette with squared euclidean distance = $standSilhouette") + // labels.map{t=>(t._3,1)}.reduceByKey{(x,y)=>x+y}.collect().foreach(println) + println("\n--------success--------\n") + Utils.saveEvaluation(standSilhouette, params.saveDataPath, sc) + val costTime = (t2 - startTime).toDouble/60/1000 + (standSilhouette, costTime) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala new file mode 100644 index 0000000..cc70db2 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/IDFRunner.scala @@ -0,0 +1,141 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.IDFVerify + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.SparkConf +import org.apache.spark.sql.{Encoders, SparkSession} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.feature.{IDF, IDFModel} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileInputStream, FileOutputStream, FileWriter, ObjectInputStream, ObjectOutputStream} +import java.util +import scala.beans.BeanProperty + +class IDFConfig extends Serializable { + @BeanProperty var idf: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class IDFParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var combineStrategy: String = _ + @BeanProperty var fetchMethod: String = _ + @BeanProperty var orcFormat: Boolean = _ + + 
@BeanProperty var dataPath: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + + +object IDFRunner{ + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/idf/idf.yml") + val representer = new Representer + representer.addClassTag(classOf[IDFParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[IDFConfig]), representer, options) + val description = new TypeDescription(classOf[IDFParams]) + yaml.addTypeDescription(description) + val configs: IDFConfig = yaml.load(stream).asInstanceOf[IDFConfig] + val paramsMap: util.HashMap[String, Object] = configs.idf.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new IDFParams() + params.setPt(paramsMap.get("pt").asInstanceOf[Int]) + params.setCombineStrategy(paramsMap.get("combineStrategy").asInstanceOf[String]) + params.setFetchMethod(paramsMap.get("fetchMethod").asInstanceOf[String]) + params.setOrcFormat(paramsMap.get("orcFormat").asInstanceOf[Boolean]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("IDF") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + conf.set("spark.driver.maxResultSize", "256G") + if (isRaw.equals("no")){ + conf.set("spark.sophon.ml.idf.combineStrategy", + paramsMap.get("combineStrategy").asInstanceOf[String]) + conf.set("spark.sophon.ml.idf.fetchMethod", + paramsMap.get("fetchMethod").asInstanceOf[String]) + } + val spark = SparkSession.builder().config(conf).getOrCreate() + val costTime = new IDFKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(IDFVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + +} + +class IDFKernel { + def 
runJob(spark: SparkSession, params: IDFParams): Double = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val orcData = spark.read.schema(Encoders.product[DocSchema].schema).format("orc").load(params.dataPath) + val data = if (params.pt > 0){ + orcData.select("tf").repartition(params.pt) + } else { + orcData.select("tf") + } + val idf = new IDF().setInputCol("tf").setOutputCol("tf_idf") + val model = idf.fit(data) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + val res = model.idf.toArray + IDFVerify.saveRes(res, params.saveDataPath, sc) + costTime + } + case class DocSchema(id: Long, tf: Vector) +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/IFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/IFRunner.scala new file mode 100644 index 0000000..2017d6e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/IFRunner.scala @@ -0,0 +1,205 @@ +package com.bigdata.ml + +import com.huawei.bigdata.alogrithms.isolationforest._ +import com.linkedin.relevance.isolationforest.{IsolationForest => LinkedinIsolationForest} +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.evaluation.BinaryClassificationMetrics +import org.apache.spark.sql.Row + +import java.io.FileWriter +import java.util +import java.util.Date +import scala.beans.BeanProperty + +class IFConfig extends Serializable { + @BeanProperty var IF: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class IFParams extends Serializable { + @BeanProperty var pt: Int = _ + @BeanProperty var numTrees: Int = _ + @BeanProperty var bootstrap: Boolean = _ + @BeanProperty var maxInstances: Int = _ + @BeanProperty var maxFea: Double = _ + @BeanProperty var featuresCol: String = _ + @BeanProperty var predictionCol: String = _ + @BeanProperty var scoreCol: String = _ + @BeanProperty var contamination: Double = _ + @BeanProperty var randomSeed: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var auROC: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object IFRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/if/if.yml") + val representer = new Representer + representer.addClassTag(classOf[IFParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[IFConfig]), representer, options) + val description = new TypeDescription(classOf[IFParams]) + yaml.addTypeDescription(description) + val 
configs: IFConfig = yaml.load(stream).asInstanceOf[IFConfig] + val params = new IFParams() + val paramsMap: util.HashMap[String, Object] = configs.IF.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "280").asInstanceOf[Int]) + params.setNumTrees(paramsMap.getOrDefault("numTrees", "100").asInstanceOf[Int]) + params.setBootstrap(paramsMap.getOrDefault("bootstrap", "false").asInstanceOf[Boolean]) + params.setMaxInstances(paramsMap.getOrDefault("maxInstances", "256").asInstanceOf[Int]) + params.setMaxFea(paramsMap.getOrDefault("maxFea", "1.0").asInstanceOf[Double]) + params.setFeaturesCol(paramsMap.getOrDefault("featuresCol", "features").asInstanceOf[String]) + params.setPredictionCol(paramsMap.getOrDefault("predictionCol", "predictedLabel").asInstanceOf[String]) + params.setScoreCol(paramsMap.getOrDefault("scoreCol", "anomalyScore").asInstanceOf[String]) + params.setContamination(paramsMap.getOrDefault("contamination", "0.1").asInstanceOf[Double]) + params.setRandomSeed(paramsMap.getOrDefault("randomSeed", "11").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("IF") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val (auROC, costTime) = new IFKernel().runJob(spark, params) + params.setAuROC(auROC) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; auROC: ${auROC};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class IFKernel { + def runJob(spark: SparkSession, params: IFParams): (Double, Double) = { + import spark.implicits._ + val sc = spark.sparkContext + val dataPath = params.dataPath + val pt = params.pt + val numTrees = params.numTrees + val bootstrap = params.bootstrap + val maxInstances = params.maxInstances + val maxFea = params.maxFea + val featuresCol = params.featuresCol + val predictionCol = params.predictionCol + val scoreCol = params.scoreCol + val contamination = params.contamination + val randomSeed = params.randomSeed + + val startTime = System.currentTimeMillis() + val data0 = sc.textFile(dataPath).repartition(pt).map { line => + val parts = line.split(',') + (Vectors.dense(parts(1).split(' ').map(_.toDouble)), parts(0).toDouble) + }.cache() + val data = data0.toDF("features", "label") + println(data.count()) + + val isolationForest 
= params.isRaw match { + case "yes" => { + val IF = new LinkedinIsolationForest() + .setNumEstimators(numTrees) + .setBootstrap(bootstrap) + .setMaxSamples(maxInstances) + .setMaxFeatures(maxFea) + .setFeaturesCol(featuresCol) + .setPredictionCol(predictionCol) + .setScoreCol(scoreCol) + .setContamination(contamination) + .setContaminationError(0.01 * contamination) + .setRandomSeed(randomSeed) + IF + } + case "no" => { + val IF = new IsolationForest() + .setNumTrees(numTrees) + .setBootstrap(bootstrap) + .setMaxInstances(maxInstances) + .setMaxFea(maxFea) + .setFeaturesCol(featuresCol) + .setPredictionCol(predictionCol) + .setScoreCol(scoreCol) + .setAnomalyRatio(contamination) + .setAnomalyRatioError(0.01 * contamination) + .setRandomSeed(randomSeed) + IF + } + } + + val isolationForestModel = isolationForest.fit(data) + val dataWithScores = isolationForestModel.transform(data).cache() + println(dataWithScores.count()) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val predictionAndLabels = dataWithScores.select("anomalyScore", "label") + .rdd.map { + case Row(score: Float, label: Double) => + (score.toDouble, label) + case Row(score: Double, label: Double) => + (score, label) + } + + val metrics = new BinaryClassificationMetrics(predictionAndLabels) + val auROC = metrics.areaUnderROC + println("Area under ROC = " + auROC) + Utils.saveEvaluation(auROC, params.saveDataPath, sc) + (auROC, costTime) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/KMeansRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/KMeansRunner.scala new file mode 100644 index 0000000..85aa91a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/KMeansRunner.scala @@ -0,0 +1,200 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.apache.hadoop.io.LongWritable +import org.apache.mahout.math.VectorWritable +import org.apache.spark.ml.clustering.{KMeans => MlKMeans} +import org.apache.spark.ml.linalg.{Vectors => MlVectors} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.mllib.clustering.{KMeans => MlibKMeans} +import org.apache.spark.mllib.linalg.{Vectors => MlibVectors} +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions.udf +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import scala.beans.BeanProperty +import java.io.{File, FileWriter} +import java.util.HashMap + +class KMeansConfig extends Serializable { + + @BeanProperty var kmeans: util.HashMap[String, util.HashMap[String, Object]] = _ +} + +class KMeansParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var maxIterations: Int = _ + @BeanProperty var k: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var datasetCpuName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = 
_ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object KMeansRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, apiName, cpuName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4), modelConfSplit(5)) + val dataPath = args(1) + val datasetCpuName = s"${datasetName}_${cpuName}" + val saveResultPath = args(2) + + val stream = Utils.getStream("conf/ml/kmeans/kmeans.yml") + val representer = new Representer + representer.addClassTag(classOf[KMeansParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[KMeansConfig]), representer, options) + val description = new TypeDescription(classOf[KMeansParams]) + yaml.addTypeDescription(description) + val config: KMeansConfig = yaml.load(stream).asInstanceOf[KMeansConfig] + val paramsMap: util.HashMap[String, Object] = config.kmeans.get(datasetCpuName) + val params = new KMeansParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMaxIterations(paramsMap.get("maxIterations").asInstanceOf[Int]) + params.setK(paramsMap.get("k").asInstanceOf[Int]) + params.setApiName(apiName) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setDatasetCpuName(datasetCpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("KMeans") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${dataStructure}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_${dataStructure}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_${dataStructure}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = dataStructure match { + case "dataframe" => new KMeansKernel().runDataFrameJob(spark, params) + case "rdd" => new KMeansKernel().runRDDJob(spark, params) + } + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class KMeansKernel { + + def runDataFrameJob(spark: SparkSession, params: KMeansParams): (Double, Double) = { + + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val data = sc.sequenceFile[LongWritable, VectorWritable](params.dataPath) + val dataRDD = data.map{ case (k, v) => + var vector: Array[Double] = new Array[Double](v.get().size) + for (i <- 0 until 
v.get().size) vector(i) = v.get().get(i) + vector + }.repartition(params.numPartitions).persist() + + import spark.implicits._ + val dataDF = dataRDD.toDF("features") + val convertToVector = udf((array: Seq[Double]) => { + MlVectors.dense(array.toArray) + }) + val trainingData = dataDF.withColumn("features", convertToVector($"features")) + println("count: " + trainingData.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + val kmeans = new MlKMeans().setK(params.k).setMaxIter(params.maxIterations) + + val paramMap = ParamMap(kmeans.k -> params.k) + .put(kmeans.maxIter, params.maxIterations) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(kmeans.k -> params.k) + .put(kmeans.maxIter, params.maxIterations) + } + + + val maxIterParamPair = ParamPair(kmeans.maxIter, params.maxIterations) + val kPair = ParamPair(kmeans.k, params.k) + val model = params.apiName match { + case "fit" => kmeans.fit(trainingData) + case "fit1" => kmeans.fit(trainingData, paramMap) + case "fit2" => + val models = kmeans.fit(trainingData, paramMaps) + models(0) + case "fit3" => kmeans.fit(trainingData, kPair, maxIterParamPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + val res = model.computeCost(trainingData) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def runRDDJob(spark: SparkSession, params: KMeansParams): (Double, Double) = { + + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val data = sc.sequenceFile[LongWritable, VectorWritable](params.dataPath) + val dataRDD = data.map{ case (k, v) => + var vector: Array[Double] = new Array[Double](v.get().size) + for (i <- 0 until v.get().size) vector(i) = v.get().get(i) + MlibVectors.dense(vector) + }.repartition(params.numPartitions).cache() + println("count: " + dataRDD.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val model = new MlibKMeans() + .setK(params.k) + .setMaxIterations(params.maxIterations) + .run(dataRDD) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + params.setLoadDataTime(loadDataTime) + val res = model.computeCost(dataRDD) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/KNNRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/KNNRunner.scala new file mode 100644 index 0000000..5c0dd27 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/KNNRunner.scala @@ -0,0 +1,244 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.neighbors.KNN +import org.apache.spark.ml.linalg.Vectors +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter, PrintWriter} +import java.util +import scala.beans.BeanProperty + +class KNNConfig extends Serializable { + + @BeanProperty var knn: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class KNNParams extends Serializable { + + @BeanProperty var pt: Int = _ + @BeanProperty var k: Int = _ + @BeanProperty var testNum: Int = _ + @BeanProperty var testBatchSize: Int = _ + @BeanProperty var featuresCol: String = _ 
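+ // featuresCol, neighborsCol and distanceCol name the input vector column and the two output columns of the KNN transform; topTreeSizeRate, topTreeLeafSize and subTreeLeafSize are tree-construction settings that are only read by the raw path (runRawJob).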
+ @BeanProperty var distanceCol: String = _ + @BeanProperty var neighborsCol: String = _ + @BeanProperty var topTreeSizeRate: Double = _ + @BeanProperty var topTreeLeafSize: Int = _ + @BeanProperty var subTreeLeafSize: Int = _ + + @BeanProperty var inputDataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ +} + +object KNNRunner { + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw) = + (modelConfSplit(0), modelConfSplit(1)) + val inputDataPath = args(1) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/knn/knn.yml") + val representer = new Representer + representer.addClassTag(classOf[KNNParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[KNNConfig]), representer, options) + val description = new TypeDescription(classOf[KNNParams]) + yaml.addTypeDescription(description) + val configs: KNNConfig = yaml.load(stream).asInstanceOf[KNNConfig] + val params = new KNNParams() + val paramsMap: util.HashMap[String, Object] = configs.knn.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "200").asInstanceOf[Int]) + params.setK(paramsMap.getOrDefault("k", "10").asInstanceOf[Int]) + params.setTestNum(paramsMap.getOrDefault("testNum", "100000").asInstanceOf[Int]) + params.setTestBatchSize(paramsMap.getOrDefault("testBatchSize", "10").asInstanceOf[Int]) + params.setTopTreeSizeRate(paramsMap.getOrDefault("topTreeSizeRate", "10.0").asInstanceOf[Double]) + params.setTopTreeLeafSize(paramsMap.getOrDefault("topTreeLeafSize", "10").asInstanceOf[Int]) + params.setSubTreeLeafSize(paramsMap.getOrDefault("subTreeLeafSize", "30").asInstanceOf[Int]) + params.setFeaturesCol("features") + params.setDistanceCol("distances") + params.setNeighborsCol("neighbors") + params.setInputDataPath(inputDataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setAlgorithmName("KNN") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val costTime = if (isRaw == "no") { + new KNNKernel().runJob(spark, params) + } else 
{ + new KNNKernel().runRawJob(spark, params) + } + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class KNNKernel { + + def runJob(spark: SparkSession, params: KNNParams): Double = { + + import spark.implicits._ + val startTime = System.currentTimeMillis() + val dataPath = params.inputDataPath + val featuresCol = params.featuresCol + val testNum = params.testNum + val pt = params.pt + val neighborsCol = params.neighborsCol + val distanceCol = params.distanceCol + val testBatchSize = params.testBatchSize + val k = params.k + + //read data + val rawData = spark.sparkContext.textFile(dataPath) + .map(line => { + val arr = line.split("\t") + val id = arr(0).toLong + val feature = Vectors.dense(arr(1).split(",").map(_.toDouble)) + (id, feature) + }).toDF("id", featuresCol).cache() + + //split train/test datasets + val trainDataDF = rawData.filter($"id" >= testNum).repartition(pt).cache() + val testDataDF = rawData.filter($"id" < testNum).repartition(pt).cache() + trainDataDF.count() + testDataDF.count() + + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + //fit + val model = new KNN() + .setFeaturesCol(featuresCol) + .setAuxiliaryCols(Array("id")) + .fit(trainDataDF) + + //transform + val testResults = model + .setNeighborsCol(neighborsCol) + .setDistanceCol(distanceCol) + .setK(k) + .setTestBatchSize(testBatchSize) + .transform(testDataDF).cache() + testResults.count() + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + costTime + } + + def runRawJob(spark: SparkSession, params: KNNParams): Double = { + + import spark.implicits._ + val startTime = System.currentTimeMillis() + val dataPath = params.inputDataPath + val featuresCol = params.featuresCol + val testNum = params.testNum + val pt = params.pt + val neighborsCol = params.neighborsCol + val distanceCol = params.distanceCol + val k = params.k + val topTreeSizeRate = params.topTreeSizeRate + val topTreeLeafSize = params.topTreeLeafSize + val subTreeLeafSize = params.subTreeLeafSize + + //read data + val rawData = spark.sparkContext.textFile(dataPath) + .map(line => { + val arr = line.split("\t") + val id = arr(0).toLong + val feature = Vectors.dense(arr(1).split(",").map(_.toDouble)) + (id, feature) + }).toDF("id", featuresCol).cache() + + //split train/test datasets + val trainDataDF = rawData.filter($"id" >= testNum).repartition(pt).cache() + val testDataDF = rawData.filter($"id" < testNum).repartition(pt).cache() + trainDataDF.count() + testDataDF.count() + + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + //fit + import org.apache.spark.ml.knn.KNN + val model = new KNN() + .setTopTreeSize((pt * topTreeSizeRate).toInt) + .setTopTreeLeafSize(topTreeLeafSize) + .setSubTreeLeafSize(subTreeLeafSize) + .setBalanceThreshold(0.0) + .setFeaturesCol(featuresCol) + .setAuxCols(Array("id")) + .fit(trainDataDF) + + //transform + val testResults = model + .setBufferSize(Double.MaxValue) + .setNeighborsCol(neighborsCol) + .setDistanceCol(distanceCol) + .setK(k) + .transform(testDataDF).cache() + testResults.count() + + val costTime = (System.currentTimeMillis() - 
startTime) / 1000.0 + + costTime + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/LDARunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/LDARunner.scala new file mode 100644 index 0000000..d383ef9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/LDARunner.scala @@ -0,0 +1,282 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.ml.clustering.{LDAModel, LDA => MLLDA} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.rdd.RDD +import org.apache.spark.mllib.linalg.{Vector => MLlibVector} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.mllib.clustering.{LocalLDAModel, LDA => MLlibLDA} +import org.apache.spark.sql.SparkSession +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util.HashMap +import java.util +import scala.beans.BeanProperty + + +class LDAConfig extends Serializable { + + @BeanProperty var lda: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class LDAParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var numPartitionsTest: Int = _ + @BeanProperty var numFeatures: Int = _ + @BeanProperty var checkpointInterval: Int = _ + @BeanProperty var inputDataType: String = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var k: Int = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var dataStructure: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object LDARunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/lda/lda.yml") + val representer = new Representer + representer.addClassTag(classOf[LDAParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LDAConfig]), representer, options) + val description = new TypeDescription(classOf[LDAParams]) + yaml.addTypeDescription(description) + val config: LDAConfig = yaml.load(stream).asInstanceOf[LDAConfig] + val paramsMap: util.HashMap[String, Object] = config.lda.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new 
LDAParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setNumPartitionsTest(paramsMap.get("numPartitionsTest").asInstanceOf[Int]) + params.setNumFeatures(paramsMap.get("numFeatures").asInstanceOf[Int]) + params.setCheckpointInterval(paramsMap.get("checkpointInterval").asInstanceOf[Int]) + params.setInputDataType(paramsMap.get("inputDataType").asInstanceOf[String]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setK(paramsMap.get("k").asInstanceOf[Int]) + params.setApiName(apiName) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setDataStructure(dataStructure) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("LDA") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${dataStructure}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = dataStructure match { + case "dataframe" => { + val (trainModel, dataFrameCostTime) = new LDAKernel().runDataFrameJob(spark, params) + val confPredict = new SparkConf() + .setAppName(s"LDA_${dataStructure}_${datasetName}_${apiName}_${cpuName}_predict").set("spark.task.cpus", "1") + //val sparkPredict = SparkSession.builder.config(confPredict).getOrCreate() + val dataFrameRes = new LDAKernel().runPredictJob(spark, params, trainModel) + (dataFrameRes, dataFrameCostTime) + } + case "rdd" => (0.0, new LDAKernel().runRDDJob(spark, params)) + } + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + + +class LDAKernel { + + def runDataFrameJob(spark: SparkSession, params: LDAParams): (LDAModel, Double) = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val trainingData = if (params.inputDataType == "libsvm") { + spark.read.format("libsvm") + .option("numFeatures", params.numFeatures) + .load(params.trainingDataPath) + .repartition(params.numPartitions) + .persist() + } else { + import spark.implicits._ + val corpus: RDD[(Long, MLlibVector)] = sc.objectFile(params.trainingDataPath) + corpus + .map(i => (i._1, i._2.asML)) + .toDF() + .repartition(params.numPartitions) + .withColumnRenamed("_1", "label") + .withColumnRenamed("_2", "features") + .persist() + } + + println("count: " + 
trainingData.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + val lda = new MLLDA() + .setSeed(2020L) + .setK(params.k) + .setMaxIter(params.maxIter) + .setCheckpointInterval(params.checkpointInterval) + + val paramMap = ParamMap(lda.k -> params.k) + .put(lda.maxIter, params.maxIter) + + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size - 1) { + paramMaps(i) = ParamMap(lda.k -> params.k) + .put(lda.maxIter, params.maxIter) + } + val maxIterParamPair = ParamPair(lda.maxIter, params.maxIter) + val kPair = ParamPair(lda.k, params.k) + val model = params.apiName match { + case "fit" => lda.fit(trainingData) + case "fit1" => lda.fit(trainingData, paramMap) + case "fit2" => + val models = lda.fit(trainingData, paramMaps) + models(0) + case "fit3" => lda.fit(trainingData, kPair, maxIterParamPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("load and train costTime: " + costTime) + + trainingData.unpersist() + //spark.close() + (model, costTime) + } + + def runPredictJob(spark: SparkSession, params: LDAParams, model: LDAModel): Double = { + val sc = spark.sparkContext + + println("====== Start Predict =======: ") + val startTime = System.currentTimeMillis() + val testData = if (params.inputDataType == "libsvm") { + spark.read.format("libsvm") + .option("numFeatures", params.numFeatures) + .load(params.testDataPath) + .repartition(params.numPartitionsTest) + .persist() + } else { + import spark.implicits._ + val corpus: RDD[(Long, MLlibVector)] = sc.objectFile(params.testDataPath) + corpus + .map(i => (i._1, i._2.asML)) + .toDF() + .repartition(params.numPartitionsTest) + .withColumnRenamed("_1", "label") + .withColumnRenamed("_2", "features") + .persist() + } + val testDataSize = testData.count() + val fraction = if (testDataSize <= 10000) { + 1.0 + } else { + 10000.0 / testDataSize.toDouble + } + val testDataSample = testData.sample(fraction, 2020L) + + val res = model.logLikelihood(testDataSample) + println("====== End Predict =======: ") + val predictTime = (System.currentTimeMillis() - startTime) / 1000.0 + println("Predict costTime: " + (predictTime)) + + testData.unpersist() + var result = new java.math.BigDecimal(res) + Utils.saveLDARes(result, params.saveDataPath, sc) + res + } + + def runRDDJob(spark: SparkSession, params: LDAParams): Double = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val trainingData = if (params.inputDataType == "libsvm") { + MLUtils.loadLibSVMFile(sc, params.trainingDataPath, params.numFeatures) + .map(i => (i.label.toLong, i.features)) + .repartition(params.numPartitions) + .persist() + } else { + val corpus: RDD[(Long, MLlibVector)] = sc.objectFile(params.trainingDataPath) + corpus + .repartition(params.numPartitions) + .persist() + } + println("count: " + trainingData.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val model = new MLlibLDA() + .setSeed(2020L) + .setK(params.k) + .setMaxIterations(params.maxIter) + .setCheckpointInterval(params.checkpointInterval) + .setOptimizer("online") + .run(trainingData) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + trainingData.unpersist() + Utils.saveEvaluation(0.0, params.saveDataPath, sc) + costTime + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRunner.scala 
b/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRunner.scala new file mode 100644 index 0000000..34a0368 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/LightGBMRunner.scala @@ -0,0 +1,283 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import com.microsoft.ml.spark.core.metrics.MetricConstants +import com.microsoft.ml.spark.train.ComputeModelStatistics +import com.microsoft.ml.spark.lightgbm.{LightGBMClassifier, LightGBMRegressor} +import com.typesafe.config.{Config, ConfigFactory} +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.SparkConf +import org.apache.spark.storage.StorageLevel +import org.apache.spark.ml.evaluation.MulticlassClassificationEvaluator +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.lang.System.nanoTime +import java.io.{File, FileWriter, PrintWriter} +import java.nio.file.{Paths, Files} +import java.util +import scala.beans.BeanProperty +import scala.util.Random + +class LightGBMConfig extends Serializable { + @BeanProperty var lgbm: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class LightGBMParams extends Serializable { + @BeanProperty var objective: String = _ + @BeanProperty var labelCol: String = _ + @BeanProperty var featuresCol: String = _ + @BeanProperty var verbosity: Int = _ + @BeanProperty var learningRate: Double = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBin: Int = _ + @BeanProperty var numIterations: Int = _ + @BeanProperty var numTasks: Int = _ + @BeanProperty var minGainToSplit: Double = _ + @BeanProperty var lambdaL2: Double = _ + @BeanProperty var numLeaves: Int = _ + @BeanProperty var minSumHessianInLeaf: Double = _ + @BeanProperty var minDataInLeaf: Int = _ + @BeanProperty var baggingFraction: Double = _ + @BeanProperty var baggingFreq: Int = _ + @BeanProperty var numThreads: Int = _ + @BeanProperty var networkCompression: Int = _ + @BeanProperty var histSynchAlgo: Int = _ + @BeanProperty var loglossApx: Int = _ + @BeanProperty var loglossApxEps: Double = _ + @BeanProperty var loadingBalance: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object LightGBMRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = 
Utils.getStream("conf/ml/lgbm/lgbm.yml") + val representer = new Representer + representer.addClassTag(classOf[LightGBMParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LightGBMConfig]), representer, options) + val description = new TypeDescription(classOf[LightGBMParams]) + yaml.addTypeDescription(description) + val configs: LightGBMConfig = yaml.load(stream).asInstanceOf[LightGBMConfig] + val params = new LightGBMParams() + val paramsMap: util.HashMap[String, Object] = configs.lgbm.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(algorithmType).get(datasetName) + params.setObjective(paramsMap.get("objective").asInstanceOf[String]) + params.setLabelCol(paramsMap.get("labelCol").asInstanceOf[String]) + params.setFeaturesCol(paramsMap.get("featuresCol").asInstanceOf[String]) + params.setVerbosity(paramsMap.get("verbosity").asInstanceOf[Int]) + params.setLearningRate(paramsMap.get("eta").asInstanceOf[Double]) + params.setMaxDepth(paramsMap.get("max_depth").asInstanceOf[Int]) + params.setMaxBin(paramsMap.get("max_bin").asInstanceOf[Int]) + params.setNumIterations(paramsMap.get("num_round").asInstanceOf[Int]) + params.setNumTasks(paramsMap.get("num_tasks").asInstanceOf[Int]) + params.setMinGainToSplit(paramsMap.get("min_gain_to_split").asInstanceOf[Double]) + params.setLambdaL2(paramsMap.get("lambda_l2").asInstanceOf[Double]) + params.setNumLeaves(paramsMap.get("num_leaves").asInstanceOf[Int]) + params.setMinSumHessianInLeaf(paramsMap.get("min_child_weight").asInstanceOf[Double]) + params.setMinDataInLeaf(paramsMap.get("min_data_in_leaf").asInstanceOf[Int]) + params.setBaggingFraction(paramsMap.get("bagging").asInstanceOf[Double]) + params.setBaggingFreq(paramsMap.get("bagging_freq").asInstanceOf[Int]) + params.setNumThreads(paramsMap.get("num_threads").asInstanceOf[Int]) + params.setNetworkCompression(paramsMap.get("network_compression").asInstanceOf[Int]) + params.setHistSynchAlgo(paramsMap.get("hist_synch_algo").asInstanceOf[Int]) + params.setLoglossApx(paramsMap.get("logloss_apx").asInstanceOf[Int]) + params.setLoglossApxEps(paramsMap.get("logloss_apx_eps").asInstanceOf[Double]) + params.setLoadingBalance(paramsMap.get("loading_balance").asInstanceOf[String]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setAlgorithmType(algorithmType) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("LightGBM") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = new LightGBMKernel().runJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) 
+ case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class LightGBMKernel{ + def runJob(spark: SparkSession, params: LightGBMParams): (Double, Double) = { + val sc = spark.sparkContext + sc.setLogLevel("INFO") + println(s"Initialized spark session.") + val t1 = System.currentTimeMillis() + + import spark.implicits._ + val trainData = spark.read.format("libsvm").option("vectorType", "sparse") + .load(params.trainingDataPath) + .repartition(params.numTasks) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val t2 = System.currentTimeMillis() + println("* after preprocess: " + t2) + + val lgbm = params.algorithmType match { + case "classification" =>{ + val classifier = new LightGBMClassifier() + .setObjective(params.objective) + .setLabelCol(params.labelCol) + .setFeaturesCol(params.featuresCol) + .setVerbosity(params.verbosity) + .setNumIterations(params.numIterations) + .setMaxDepth(params.maxDepth) + .setLearningRate(params.learningRate) + .setNumTasks(params.numTasks) + .setMaxBin(params.maxBin) + .setMinGainToSplit(params.minGainToSplit) + .setLambdaL2(params.lambdaL2) + .setNumLeaves(params.numLeaves) + .setMinDataInLeaf(params.minDataInLeaf) + .setMinSumHessianInLeaf(params.minSumHessianInLeaf) + .setBaggingFraction(params.baggingFraction) + .setBaggingFreq(params.baggingFreq) + classifier + } + case "regression" =>{ + val regressor = new LightGBMRegressor() + .setObjective(params.objective) + .setLabelCol(params.labelCol) + .setFeaturesCol(params.featuresCol) + .setVerbosity(params.verbosity) + .setNumIterations(params.numIterations) + .setMaxDepth(params.maxDepth) + .setLearningRate(params.learningRate) + .setNumTasks(params.numTasks) + .setMaxBin(params.maxBin) + .setMinGainToSplit(params.minGainToSplit) + .setLambdaL2(params.lambdaL2) + .setNumLeaves(params.numLeaves) + .setMinDataInLeaf(params.minDataInLeaf) + .setMinSumHessianInLeaf(params.minSumHessianInLeaf) + .setBaggingFraction(params.baggingFraction) + .setBaggingFreq(params.baggingFreq) + regressor + } + } + if(params.isRaw.equals("no")) { + lgbm.setAuxParams("num_threads", params.numThreads.toString) + lgbm.setAuxParams("network_compression", params.networkCompression.toString) + lgbm.setAuxParams("logloss_apx", params.loglossApx.toString) + lgbm.setAuxParams("logloss_apx_eps", params.loglossApxEps.toString) + lgbm.setAuxParams("loading_balance", params.loadingBalance) + } + val model = lgbm.fit(trainData) + val t3 = System.currentTimeMillis() + println("* after train: " + t3) + + val testData = spark.read.format("libsvm").option("vectorType", "sparse") + .load(params.testDataPath) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + println(s"Test data read successful. 
Number of partitions - ${testData.rdd.getNumPartitions}") + val predictions = model.transform(testData) + val (res, t4) = params.algorithmType match { + case "classification" =>{ + val metrics = new ComputeModelStatistics() + .setLabelCol("label") + .setScoresCol("probability") + .setScoredLabelsCol("prediction") + .setEvaluationMetric(MetricConstants.AccuracySparkMetric) + .transform(predictions) + val ecc = metrics.collect().apply(0).apply(1).asInstanceOf[Double] + val t4 = System.currentTimeMillis() + (ecc, t4) + } + case "regression" =>{ + // compute model metrics + val metrics = new ComputeModelStatistics() + .setEvaluationMetric("regression") + .setLabelCol("label") + .setScoresCol("prediction") + .transform(predictions) + // print metrics + val mse = metrics.collect().apply(0).apply(0).asInstanceOf[Double] + val t4 = System.currentTimeMillis() + (mse, t4) + } + } + println("Model predictions:") + predictions.select("prediction", "label", "features").show(5) + val trainingProcess = (t3 - t1).toDouble / 1000 + val trainingStep = (t3 - t2).toDouble / 1000 + val dataProcess = (t2 - t1).toDouble / 1000 + val predict = (t4 - t3).toDouble / 1000 + println("[s]train total: " + trainingProcess) + println("[s]data preprocess: " + dataProcess) + println("[s]train: " + trainingStep) + println("[s]predict: " + predict) + + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, trainingProcess) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/LinRRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/LinRRunner.scala new file mode 100644 index 0000000..33e5b58 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/LinRRunner.scala @@ -0,0 +1,236 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.linalg.DenseMatrix +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.regression.LinearRegression +import org.apache.spark.sql.functions.udf +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class LinRConfig extends Serializable { + + @BeanProperty var linR: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class LinRParams extends Serializable { + + @BeanProperty var pt: Int = _ + @BeanProperty var numFeatures: Int = _ + @BeanProperty var loss: String = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var elasticNetParam: Double = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var tolerance: Double = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var 
verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object LinRRunner { + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/linR/linR.yml") + val representer = new Representer + representer.addClassTag(classOf[LinRParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LinRConfig]), representer, options) + val description = new TypeDescription(classOf[LinRParams]) + yaml.addTypeDescription(description) + val configs: LinRConfig = yaml.load(stream).asInstanceOf[LinRConfig] + val paramsMap: util.HashMap[String, Object] = configs.linR.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new LinRParams() + params.setPt(paramsMap.getOrDefault("pt", "2000").asInstanceOf[Int]) + params.setNumFeatures(paramsMap.getOrDefault("numFeatures", "500").asInstanceOf[Int]) + params.setLoss(paramsMap.getOrDefault("loss", "squaredError").asInstanceOf[String]) + params.setRegParam(paramsMap.getOrDefault("regParam", "0.01").asInstanceOf[Double]) + params.setElasticNetParam(paramsMap.getOrDefault("elasticNetParam", "0.0").asInstanceOf[Double]) + params.setMaxIter(paramsMap.getOrDefault("maxIter", "500").asInstanceOf[Int]) + params.setTolerance(paramsMap.getOrDefault("tolerance", "1E-6").asInstanceOf[Double]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setApiName(apiName) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("LinR") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = new LinRKernel().runJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + 
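+        // ml_isCorrect.txt is opened in append mode, so every test case adds one "<testcase> <result>" line to the shared report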
writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class LinRKernel { + + def runJob(spark: SparkSession, params: LinRParams): (Double, Double) = { + + import spark.implicits._ + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val trainData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .option("numFeatures", params.numFeatures) + .load(params.trainingDataPath) + .repartition(params.pt) + .cache() + + val trainingData = if (params.datasetName == "rcv") { + val label_convert = udf((x: Double) => if (x < 0.0) 0.0 else 1.0) + trainData.withColumn("label", label_convert($"label")) + } else { + trainData + } + println(s"trainingData: ${trainingData.count()}") + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val numIterationJY = 10 + val regParamJY = 0.01 + + val linR = new LinearRegression() + .setSolver("l-bfgs") + .setLoss(params.loss) + .setRegParam(regParamJY) + .setElasticNetParam(params.elasticNetParam) + .setMaxIter(numIterationJY) + .setTol(params.tolerance) + + if (params.apiName == "fit"){ + linR.setRegParam(params.regParam) + linR.setMaxIter(params.maxIter) + } + + val paramMap = ParamMap(linR.maxIter -> params.maxIter) + .put(linR.regParam, params.regParam) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(linR.maxIter -> params.maxIter) + .put(linR.regParam, params.regParam) + } + val maxIterParamPair = ParamPair(linR.maxIter, params.maxIter) + val regParamPair = ParamPair(linR.regParam, params.regParam) + + val model = params.apiName match { + case "fit" => linR.fit(trainingData) + case "fit1" => linR.fit(trainingData, paramMap) + case "fit2" => + val models = linR.fit(trainingData, paramMaps) + models(0) + case "fit3" => linR.fit(trainingData, maxIterParamPair, regParamPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val testData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .option("numFeatures", params.numFeatures) + .load(params.testDataPath) + .repartition(params.pt) + .cache() + + val testingData = if (params.datasetName == "rcv") { + val label_convert = udf((x: Double) => if (x < 0.0) 0.0 else 1.0) + testData.withColumn("label", label_convert($"label")) + } else { + testData + } + + val getSquaredError = udf((v1: Double, v2: Double) => { + math.pow((v1 - v2), 2) + }) + val predictions = model.transform(testingData) + .withColumn("squaredError", getSquaredError($"label", $"prediction")) + .select("squaredError").summary("mean") + + val res = predictions.select("squaredError").first().getString(0).toDouble + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/LogRRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/LogRRunner.scala new file mode 100644 index 0000000..570181d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/LogRRunner.scala @@ -0,0 +1,221 @@ +package com.bigdata.ml + +import 
com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.linalg.DenseMatrix +import org.apache.spark.ml.classification.LogisticRegression +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.sql.functions.udf +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter} +import java.util.HashMap +import scala.beans.BeanProperty + +class LogRConfig extends Serializable { + @BeanProperty var logR: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class LogRParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var numLabels: Int = _ + @BeanProperty var numFeatures: Int = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var elasticNetParam: Double = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var tolerance: Double = _ + @BeanProperty var isSetBound: Boolean = _ + @BeanProperty var coefficientLowerBound: Double = _ + @BeanProperty var coefficientUpperBound: Double = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object LogRRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/logR/logR.yml") + val representer = new Representer + representer.addClassTag(classOf[LogRParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[LogRConfig]), representer, options) + val description = new TypeDescription(classOf[LogRParams]) + yaml.addTypeDescription(description) + val config: LogRConfig = yaml.load(stream).asInstanceOf[LogRConfig] + val paramsMap: util.HashMap[String, Object] = config.logR.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new LogRParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setNumLabels(paramsMap.get("numLabels").asInstanceOf[Int]) + params.setNumFeatures(paramsMap.get("numFeatures").asInstanceOf[Int]) + params.setRegParam(paramsMap.get("regParam").asInstanceOf[Double]) + 
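+      // the keys read here correspond to the per-dataset entries in conf/ml/logR/logR.yml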
params.setElasticNetParam(paramsMap.get("elasticNetParam").asInstanceOf[Double]) + params.setTolerance(paramsMap.get("tolerance").asInstanceOf[Double]) + params.setIsSetBound(paramsMap.get("isSetBound").asInstanceOf[Boolean]) + params.setCoefficientLowerBound(paramsMap.get("coefficientLowerBound").asInstanceOf[Double]) + params.setCoefficientUpperBound(paramsMap.get("coefficientUpperBound").asInstanceOf[Double]) + params.setApiName(apiName) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("LogR") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val (res, costTime) = new LogRKernel().runJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class LogRKernel { + + def runJob(spark: SparkSession, params: LogRParams): (Double, Double) = { + + import spark.implicits._ + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val trainData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .option("numFeatures", params.numFeatures) + .load(params.trainingDataPath) + .repartition(params.numPartitions) + .cache() + + val trainingData = if (params.datasetName == "rcv") { + val label_convert = udf((x: Double) => if (x < 0.0) 0.0 else 1.0) + trainData.withColumn("label", label_convert($"label")) + } else { + trainData + } + println(s"trainingData: ${trainingData.count()}") + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val logR = if (params.isSetBound){ + new LogisticRegression() + .setRegParam(params.regParam) + .setElasticNetParam(params.elasticNetParam) + .setMaxIter(params.maxIter) + .setTol(params.tolerance) + .setLowerBoundsOnCoefficients(new DenseMatrix(params.numLabels, params.numFeatures, Array.fill(params.numLabels * params.numFeatures)(params.coefficientLowerBound), true)) + .setUpperBoundsOnCoefficients(new DenseMatrix(params.numLabels, params.numFeatures, Array.fill(params.numLabels * params.numFeatures)(params.coefficientUpperBound), true)) + } + else { + new LogisticRegression() + .setRegParam(params.regParam) + .setElasticNetParam(params.elasticNetParam) + .setMaxIter(params.maxIter) + 
.setTol(params.tolerance) + } + + val paramMap = ParamMap(logR.maxIter -> params.maxIter) + .put(logR.regParam, params.regParam) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(logR.maxIter -> params.maxIter) + .put(logR.regParam, params.regParam) + } + val maxIterParamPair = ParamPair(logR.maxIter, params.maxIter) + val regParamPair = ParamPair(logR.regParam, params.regParam) + val model = params.apiName match { + case "fit" => logR.fit(trainingData) + case "fit1" => logR.fit(trainingData, paramMap) + case "fit2" => + val models = logR.fit(trainingData, paramMaps) + models(0) + case "fit3" => logR.fit(trainingData, maxIterParamPair, regParamPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val testData = spark + .read + .format("libsvm") + .option("vectorType", "dense") + .option("numFeatures", params.numFeatures) + .load(params.testDataPath) + .repartition(params.numPartitions) + .cache() + + val testingData = if (params.datasetName == "rcv") { + val label_convert = udf((x: Double) => if (x < 0.0) 0.0 else 1.0) + testData.withColumn("label", label_convert($"label")) + } else { + testData + } + + val predictions = model.transform(testingData) + val res = predictions.filter($"label" === $"prediction").count().toDouble / predictions.count() + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/NMFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/NMFRunner.scala new file mode 100644 index 0000000..9f16c2c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/NMFRunner.scala @@ -0,0 +1,225 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.internal.Logging +import org.apache.spark.SparkConf +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.ml.recommendation.{ALS, NMF} +import org.apache.spark.ml.recommendation.ALS.{Rating => ALSRating} +import org.apache.spark.ml.recommendation.NMF.{Rating => NMFRating} +import org.apache.spark.ml.evaluation.RegressionEvaluator + +import java.io.{File, FileWriter, PrintWriter} +import java.util +import java.util.Date +import scala.beans.BeanProperty +import scala.collection.mutable.ArrayBuffer + +class NMFConfig extends Serializable { + @BeanProperty var nmf: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class NMFParams extends Serializable { + @BeanProperty var userCol: String = _ + @BeanProperty var itemCol: String = _ + @BeanProperty var ratingCol: String = _ + @BeanProperty var pt: Int = _ + @BeanProperty var rank: Int = _ + @BeanProperty var maxIter: Int = _ + + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var standSilhouette: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var 
verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object NMFRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/nmf/nmf.yml") + val representer = new Representer + representer.addClassTag(classOf[NMFParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[NMFConfig]), representer, options) + val description = new TypeDescription(classOf[NMFParams]) + yaml.addTypeDescription(description) + val configs: NMFConfig = yaml.load(stream).asInstanceOf[NMFConfig] + val paramsMap: util.HashMap[String, Object] = configs.nmf.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new NMFParams() + params.setUserCol(paramsMap.get("userCol").asInstanceOf[String]) + params.setItemCol(paramsMap.get("itemCol").asInstanceOf[String]) + params.setRatingCol(paramsMap.get("ratingCol").asInstanceOf[String]) + params.setPt(paramsMap.get("pt").asInstanceOf[Int]) + params.setRank(paramsMap.get("rank").asInstanceOf[Int]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("NMF") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + val (res, costTime) = new NMFKernel().runJob(spark, params) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class NMFKernel { + def runJob(spark: SparkSession, params: NMFParams): (Double, Double) = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + + val ratings = params.datasetName match { + case "HibenchRating50mx10mx500m" => readHibench(spark, params.dataPath, params.pt, params.isRaw) + case "CSJ" | "MT" | "Books" => readCSJAndMTAndBooks(spark, params.dataPath, params.pt, params.isRaw) + } + val Array(training, test) = 
ratings.randomSplit(Array(0.8, 0.2), 0L) + training.rdd.setName(s"$training").cache().count() + test.rdd.setName(s"$test").cache().count() + + val model = params.isRaw match { + case "no" =>{ + val nmf = new NMF() + .setUserCol(params.userCol) + .setItemCol(params.itemCol) + .setRatingCol(params.ratingCol) + .setNumBlocks(params.pt) + .setMaxIter(params.maxIter) + .setRank(params.rank) + val model = nmf.fit(training) + model.userFactors.cache().foreach(_ => {}) + model.itemFactors.cache().foreach(_ => {}) + model.setColdStartStrategy("drop") + model + } + case "yes" =>{ + val alsNN = new ALS() + .setNonnegative(true) + .setUserCol(params.userCol) + .setItemCol(params.itemCol) + .setRatingCol(params.ratingCol) + .setNumBlocks(params.pt) + .setMaxIter(params.maxIter) + .setRank(params.rank) + val model = alsNN.fit(training) + model.userFactors.cache().foreach(_ => {}) + model.itemFactors.cache().foreach(_ => {}) + model.setColdStartStrategy("drop") + model + } + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + val evaluator = new RegressionEvaluator() + .setMetricName("rmse") + .setLabelCol("rating") + .setPredictionCol("prediction") + val predictions = model.transform(test) + val rmse = evaluator.evaluate(predictions) + Utils.saveEvaluation(rmse, params.saveDataPath, sc) + (rmse, costTime) + } + + def readCSJAndMTAndBooks( + spark: SparkSession, + dataPath: String, + pt: Int = 250, + isRaw: String, + sep: String = ","): DataFrame = { + import spark.implicits._ + val ratings = spark.sparkContext + .textFile(dataPath, pt) + .map(s => { + s.split(sep) + }) + val users = ratings.map(s => s(0)).distinct().zipWithIndex() + val items = ratings.map(s => s(1)).distinct().zipWithIndex() + if (isRaw.equals("no")) { + ratings.map(s => (s(0), s.slice(1, 4))) + .join(users).map { case (userName, (arr, userId)) => (arr(0), (arr.slice(1, 3), userId)) } + .join(items).map { case (itemName, ((arr, userId), itemId)) => + NMFRating(userId, itemId, arr(0).toFloat) + }.toDF() + } else { + ratings.map(s => (s(0), s.slice(1, 4))) + .join(users).map { case (userName, (arr, userId)) => (arr(0), (arr.slice(1, 3), userId)) } + .join(items).map { case (itemName, ((arr, userId), itemId)) => + ALSRating(userId, itemId, arr(0).toFloat) + }.toDF() + } + } + + def readHibench( + spark: SparkSession, + dataPath: String, + pt: Int = 250, + isRaw: String): DataFrame = { + import spark.implicits._ + if(isRaw.equals("no")) { + spark.sparkContext.objectFile[ALSRating[Int]](dataPath, pt) + .map(x => NMFRating(x.user, x.item, x.rating)).toDF() + } else { + spark.sparkContext.objectFile[ALSRating[Int]](dataPath, pt).toDF() + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/PCARunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/PCARunner.scala new file mode 100644 index 0000000..683f300 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/PCARunner.scala @@ -0,0 +1,232 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.MatrixVerify + +import org.apache.spark.ml.feature.{PCA => MLPCA} +import org.apache.spark.ml.linalg.{Vectors => MLVectors} +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.mllib.feature.{PCA => MLibPCA} +import org.apache.spark.mllib.linalg.{DenseMatrix, Vectors => MLibVectors} +import org.apache.spark.SparkConf +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession +import 
org.apache.spark.sql.types.{StringType, StructField, StructType} +import org.apache.spark.sql.functions.udf +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter, PrintWriter} +import java.util.HashMap +import scala.beans.BeanProperty + +class PCAConfig extends Serializable { + + @BeanProperty var pca: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class PCAParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var k: Int = _ + @BeanProperty var numCols: Int = _ + @BeanProperty var numRows: Int = _ + @BeanProperty var dataFormat: String = _ + @BeanProperty var sep: String = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var dataStructure: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object PCARunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/pca/pca.yml") + val representer = new Representer + representer.addClassTag(classOf[PCAParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[PCAConfig]), representer, options) + val description = new TypeDescription(classOf[PCAParams]) + yaml.addTypeDescription(description) + val config: PCAConfig = yaml.load(stream).asInstanceOf[PCAConfig] + val paramsMap: util.HashMap[String, Object] = config.pca.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new PCAParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setK(paramsMap.get("k").asInstanceOf[Int]) + params.setNumCols(paramsMap.get("numCols").asInstanceOf[Int]) + params.setNumRows(paramsMap.get("numRows").asInstanceOf[Int]) + params.setDataFormat(paramsMap.get("dataFormat").asInstanceOf[String]) + params.setSep(paramsMap.get("sep").asInstanceOf[String]) + params.setApiName(apiName) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setDataStructure(dataStructure) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("PCA") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${dataStructure}_${datasetName}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = 
s"${params.algorithmName}_${dataStructure}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val costTime = dataStructure match { + case "dataframe" => new PCAKernel().runDataFrameJob(spark, params) + case "rdd" => new PCAKernel().runRDDJob(spark, params) + } + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(MatrixVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class PCAKernel { + + def runDataFrameJob(spark: SparkSession, params: PCAParams): Double = { + + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val numColsBC = sc.broadcast(params.numCols) + val sepBC = sc.broadcast(params.sep) + + val startTime = System.currentTimeMillis() + + val data = if (params.dataFormat == "coo") { + spark.createDataFrame(sc.textFile(params.dataPath, params.numPartitions) + .map(line => { + val entry = line.split(sepBC.value) + (entry(0).toInt, (entry(1).toInt, entry(2).toDouble)) + }).groupByKey() + .map { case (_, vectorEntries) => MLVectors.sparse(numColsBC.value, vectorEntries.toSeq) } + .repartition(params.numPartitions) + .map(Tuple1.apply)) + .toDF("matrix").persist(StorageLevel.MEMORY_ONLY) + } else { + spark.createDataFrame(sc.textFile(params.dataPath) + .map(row => Row(MLVectors.dense(row.split(sepBC.value).map(_.toDouble)))) + .repartition(params.numPartitions), + StructType(List(StructField("matrix", VectorType))) + ).persist(StorageLevel.MEMORY_ONLY) + } + + println("count: " + data.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val pca = new MLPCA().setK(params.k).setInputCol("matrix") + + val paramMap = ParamMap(pca.k -> params.k) + .put(pca.inputCol, "matrix") + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size - 1) { + paramMaps(i) = ParamMap(pca.k -> params.k) + .put(pca.inputCol, "matrix") + } + val kPair = ParamPair(pca.k, params.k) + val inputColPair = ParamPair(pca.inputCol, "matrix") + val model = params.apiName match { + case "fit" => pca.fit(data) + case "fit1" => pca.fit(data, paramMap) + case "fit2" => + val models = pca.fit(data, paramMaps) + models(0) + case "fit3" => pca.fit(data, kPair, inputColPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + val pcaMat = new DenseMatrix(model.pc.numRows, model.pc.numCols, model.pc.values, model.pc.isTransposed) + MatrixVerify.saveMatrix(pcaMat, params.saveDataPath, sc) + costTime + } + + def runRDDJob(spark: SparkSession, params: PCAParams): Double = { + + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val numColsBC = sc.broadcast(params.numCols) + val sepBC = sc.broadcast(params.sep) 
+ + val startTime = System.currentTimeMillis() + + val data = if (params.dataFormat == "coo") { + sc.textFile(params.dataPath, params.numPartitions) + .map(line => { + val entry = line.split(sepBC.value) + (entry(0).toInt, (entry(1).toInt, entry(2).toDouble)) + }).groupByKey() + .map { case (_, vectorEntries) => MLibVectors.sparse(numColsBC.value, vectorEntries.toSeq) } + .repartition(params.numPartitions).persist(StorageLevel.MEMORY_ONLY) + } else { + sc.textFile(params.dataPath) + .map(row => MLibVectors.dense(row.split(sepBC.value).map(_.toDouble))) + .repartition(params.numPartitions).persist(StorageLevel.MEMORY_ONLY) + } + + println("count: " + data.count()) + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val model = new MLibPCA(params.k).fit(data) + + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val pcaMat = model.pc + MatrixVerify.saveMatrix(pcaMat, params.saveDataPath, sc) + + costTime + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/PearsonRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/PearsonRunner.scala new file mode 100644 index 0000000..31b8685 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/PearsonRunner.scala @@ -0,0 +1,183 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.MatrixVerify + +import org.apache.spark.ml.stat +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.linalg.{Matrix, Vectors, Vector} +import org.apache.spark.mllib.linalg.{Vectors => OldVectors} +import org.apache.spark.mllib.linalg.DenseMatrix +import org.apache.spark.mllib.stat.{Statistics => OldStatistics} +import org.apache.spark.sql.{DataFrame, Row, SparkSession} +import org.apache.spark.SparkConf +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class PearsonConfig extends Serializable { + @BeanProperty var pearson: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class PearsonParams extends Serializable { + @BeanProperty var pt: Int = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object PearsonRunner { + + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val trainingDataPath = args(1) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), 
sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/pearson/pearson.yml") + val representer = new Representer + representer.addClassTag(classOf[PearsonParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[PearsonConfig]), representer, options) + val description = new TypeDescription(classOf[PearsonParams]) + yaml.addTypeDescription(description) + val configs: PearsonConfig = yaml.load(stream).asInstanceOf[PearsonConfig] + val paramsMap: util.HashMap[String, Object] = configs.pearson.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(dataStructure).get(datasetName) + val params = new PearsonParams() + params.setPt(paramsMap.getOrDefault("pt", "1000").asInstanceOf[Int]) + params.setTrainingDataPath(trainingDataPath) + params.setApiName(dataStructure) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("Pearson") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${dataStructure}_${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${dataStructure}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + println(s"Initialized spark session.") + + val costTime = dataStructure match { + case "dataframe" => + new PearsonKernel().runDataframeJob(spark, params) + case "rdd" => + new PearsonKernel().runRddJob(spark, params) + } + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(MatrixVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class PearsonKernel { + + def runDataframeJob(spark: SparkSession, params: PearsonParams): Double = { + val pt = params.pt + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val startTime = System.currentTimeMillis() + val data = spark.createDataFrame( + sc + .textFile(params.trainingDataPath) + .map(x=>Row(Vectors.dense(x.split(",").map(_.toDouble)))) + .repartition(pt), + StructType(List(StructField("matrix", VectorType))) + ).persist(StorageLevel.MEMORY_ONLY) + + val result = stat.Correlation.corr(data, "matrix") + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + val mat = result.collect()(0).getAs[Matrix](0) + val 
pearsonMat = new DenseMatrix(mat.numRows, mat.numCols, mat.toArray, mat.isTransposed) + //save result + MatrixVerify.saveMatrix(pearsonMat, params.saveDataPath, sc) + costTime + } + + def runRddJob(spark: SparkSession, params: PearsonParams): Double = { + + val pt = params.pt + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val data = spark.createDataFrame( + sc.textFile(params.trainingDataPath) + .map(x=>Row(Vectors.dense(x.split(",").map(_.toDouble)))) + .repartition(pt), + StructType(List(StructField("matrix", VectorType))) + ).persist(StorageLevel.MEMORY_ONLY) + + val rdd = data.select("matrix").rdd.map{ + case Row(v: Vector) => OldVectors.fromML(v) + } + + val oldM = OldStatistics.corr(rdd, "pearson") + val pearsonMat = oldM.asInstanceOf[DenseMatrix] + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + //save result + MatrixVerify.saveMatrix(pearsonMat, params.saveDataPath, sc) + + costTime + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/PrefixSpanRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/PrefixSpanRunner.scala new file mode 100644 index 0000000..f28b118 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/PrefixSpanRunner.scala @@ -0,0 +1,190 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.PrefixSpanVerify + +import io.airlift.compress.lz4.Lz4Codec +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.hadoop.io.NullWritable +import org.apache.spark.SparkConf +import org.apache.spark.mllib.fpm.PrefixSpan +import org.apache.spark.sql.SparkSession +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter} +import java.util.HashMap +import scala.beans.BeanProperty + + +class PrefixSpanConfig extends Serializable { + + @BeanProperty var ps: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class PrefixSpanParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var minSupport: Double = _ + @BeanProperty var maxPatternLength: Int = _ + @BeanProperty var maxLocalProjDBSize: Int = _ + @BeanProperty var localTimeout: String = _ + @BeanProperty var filterCandidates: String = _ + @BeanProperty var projDBstep: String = _ + @BeanProperty var redistributeData: Boolean = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var numFreqSeqs: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object PrefixSpanRunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName , isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/ps/ps.yml") + val representer = new Representer + representer.addClassTag(classOf[PrefixSpanParams], Tag.MAP) + val options = 
new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[PrefixSpanConfig]), representer, options) + val description = new TypeDescription(classOf[PrefixSpanParams]) + yaml.addTypeDescription(description) + val config: PrefixSpanConfig = yaml.load(stream).asInstanceOf[PrefixSpanConfig] + val params = new PrefixSpanParams() + val paramsMap = config.ps.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMinSupport(paramsMap.get("minSupport").asInstanceOf[Double]) + params.setMaxPatternLength(paramsMap.get("maxPatternLength").asInstanceOf[Int]) + params.setMaxLocalProjDBSize(paramsMap.get("maxLocalProjDBSize").asInstanceOf[Int]) + params.setLocalTimeout(paramsMap.get("localTimeout").asInstanceOf[String]) + params.setFilterCandidates(paramsMap.get("filterCandidates").asInstanceOf[String]) + params.setProjDBstep(paramsMap.get("projDBstep").asInstanceOf[String]) + params.setRedistributeData(paramsMap.get("redistributeData").asInstanceOf[Boolean]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("PrefixSpan") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + if (isRaw.equals("no")) { + conf.set("spark.boostkit.ml.ps.localTimeout", params.localTimeout) + conf.set("spark.boostkit.ml.ps.filterCandidates", params.filterCandidates) + conf.set("spark.boostkit.ml.ps.projDBstep", params.projDBstep) + } + val spark = SparkSession.builder.config(conf).getOrCreate() + + val costTime = new PrefixSpanKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(PrefixSpanVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${params.costTime}s; isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + +class PrefixSpanKernel { + + def runJob(spark: SparkSession, params: PrefixSpanParams): Double = { + + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + + val startTime = System.currentTimeMillis() + val oriSequences = sc.textFile(params.dataPath).map{line => + var trans = Array.empty[Array[Int]] + var items = Array.empty[Int] + line.split(" ").foreach{itemStr => + val item = itemStr.toInt + if (item >= 0) { + items :+= item + } else if (-1 == item) { + trans :+= items + items = Array.empty + } else if (-2 == item) { + assert(items.isEmpty) + } + } + trans + } + + val sequences = if 
(params.redistributeData) { + sc.parallelize(oriSequences.collect(), params.numPartitions).cache() + } else { + oriSequences.repartition(params.numPartitions).cache() + } + println(s"Data count: ${sequences.count()}") + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + val prefixSpan = new PrefixSpan() + .setMinSupport(params.minSupport) + .setMaxPatternLength(params.maxPatternLength) + .setMaxLocalProjDBSize(params.maxLocalProjDBSize) + val model = prefixSpan.run(sequences) + + val newPatterns = model.freqSequences.map{freqSequence => + val freqSeqs = freqSequence.sequence.map{v => + v.sorted.mkString("(", ",", ")") + }.mkString("[", ",", "]") + s"$freqSeqs,${freqSequence.freq}" + }.persist() + + val numFreqSeqs = newPatterns.count() + params.setNumFreqSeqs(numFreqSeqs) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + PrefixSpanVerify.saveRes(newPatterns, params.saveDataPath, sc) + costTime + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/RFRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/RFRunner.scala new file mode 100644 index 0000000..ac758be --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/RFRunner.scala @@ -0,0 +1,405 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.Pipeline +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.classification.RandomForestClassifier +import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.ml.feature.{IndexToString, StringIndexer, VectorIndexer} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.regression.RandomForestRegressor +import org.apache.spark.mllib.regression.LabeledPoint +import org.apache.spark.mllib.tree.RandomForest +import org.apache.spark.mllib.tree.configuration.{Algo, Strategy} +import org.apache.spark.mllib.tree.impurity.{Gini, Variance} +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.storage.StorageLevel + +import java.io.{File, FileWriter, PrintWriter} +import java.nio.file.{Paths, Files} +import java.util +import scala.beans.BeanProperty +import scala.io.Source + +class RFConfig extends Serializable { + @BeanProperty var rf: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]]] = _ +} + +class RFParams extends Serializable { + @BeanProperty var genericPt: Int = _ + @BeanProperty var maxMemoryInMB: Int = _ + @BeanProperty var pt: Int = _ + @BeanProperty var numCopiesInput: Int = _ + @BeanProperty var numTrees: Int = _ + @BeanProperty var maxDepth: Int = _ + @BeanProperty var maxBins: Int = _ + @BeanProperty var useNodeIdCache: Boolean = _ + @BeanProperty var checkpointInterval: Int = _ + @BeanProperty var numClasses: Int = _ + @BeanProperty var bcVariables: Boolean = _ + @BeanProperty var featureSubsetStrategy: String = _ + @BeanProperty var featuresType: String = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var 
algorithmType: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object RFRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (algorithmType, dataStructure, datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3), modelConfSplit(4), modelConfSplit(5)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/rf/rf.yml") + val representer = new Representer + representer.addClassTag(classOf[RFParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[RFConfig]), representer, options) + val description = new TypeDescription(classOf[RFParams]) + yaml.addTypeDescription(description) + val configs: RFConfig = yaml.load(stream).asInstanceOf[RFConfig] + val params = new RFParams() + val paramsMap: util.HashMap[String, Object] = configs.rf.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(algorithmType).get(dataStructure).get(datasetName) + params.setGenericPt(paramsMap.getOrDefault("genericPt", "1000").asInstanceOf[Int]) + params.setMaxMemoryInMB(paramsMap.getOrDefault("maxMemoryInMB", "256").asInstanceOf[Int]) + params.setPt(paramsMap.getOrDefault("pt", "1000").asInstanceOf[Int]) + params.setNumCopiesInput(paramsMap.getOrDefault("numCopiesInput", "1").asInstanceOf[Int]) + params.setNumTrees(paramsMap.getOrDefault("numTrees", "20").asInstanceOf[Int]) + params.setMaxDepth(paramsMap.getOrDefault("maxDepth", "5").asInstanceOf[Int]) + params.setMaxBins(paramsMap.getOrDefault("maxBins", "32").asInstanceOf[Int]) + params.setNumClasses(paramsMap.get("numClasses").asInstanceOf[Int]) + params.setUseNodeIdCache(paramsMap.getOrDefault("useNodeIdCache", "false").asInstanceOf[Boolean]) + params.setCheckpointInterval(paramsMap.getOrDefault("checkpointInterval", "10").asInstanceOf[Int]) + params.setFeatureSubsetStrategy(paramsMap.getOrDefault("featureSubsetStrategy", "auto").asInstanceOf[String]) + params.setFeaturesType(paramsMap.getOrDefault("featuresType", "array").asInstanceOf[String]) + params.setBcVariables(paramsMap.getOrDefault("bcVariables", "false").asInstanceOf[Boolean]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setAlgorithmType(algorithmType) + params.setApiName(apiName) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("RF") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}_${dataStructure}_${apiName}") + 
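+ // Added note (editor comment, not in the original patch): the runners share one result-path
+ // convention. The optimized run saves to ${saveResultPath}/RF/<case> and verifies against the
+ // sibling "<case>_raw" directory; when isRaw == "yes" the two paths are swapped just below, so
+ // each variant is checked against the output most recently written by the other variant.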
params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${dataStructure}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_${dataStructure}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + if (apiName != "fit") { + params.setNumTrees(5) + params.setMaxDepth(3) + } + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + if (isRaw.equals("no")) { + conf.set("spark.boostkit.ml.rf.binnedFeaturesDataType", + paramsMap.get("featuresType").asInstanceOf[String]) + conf.set("spark.boostkit.ml.rf.numTrainingDataCopies", + paramsMap.get("numCopiesInput").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.rf.numPartsPerTrainingDataCopy", + paramsMap.get("pt").asInstanceOf[Int].toString) + conf.set("spark.boostkit.ml.rf.broadcastVariables", + paramsMap.get("bcVariables").asInstanceOf[Boolean].toString) + } + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (res, costTime) = dataStructure match { + case "dataframe" => new RFKernel().rfDataframeJob(spark, params) + case "rdd" => new RFKernel().rfRDDJob(spark, params) + } + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + val isCorrect = params.algorithmType match { + case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark) + } + params.setIsCorrect(isCorrect) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class RFKernel { + def rfDataframeJob(spark: SparkSession, params: RFParams): (Double, Double) = { + val sc = spark.sparkContext + val pt = params.pt + val trainingDataPath = params.trainingDataPath + val testDataPath = params.testDataPath + val numTrees = params.numTrees + val maxDepth = params.maxDepth + val maxBins = params.maxBins + val useNodeIdCache = params.useNodeIdCache + val checkpointInterval = params.checkpointInterval + val maxMemoryInMB = params.maxMemoryInMB + val featureSubsetStrategy = params.featureSubsetStrategy + val genericPt = params.genericPt + + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + + val reader = spark.read.format("libsvm") + if (params.datasetName == "mnist8m") { + reader.option("numFeatures",784) + } else if (params.datasetName == "higgs") { + reader.option("numFeatures",28) + } else if (params.datasetName == "epsilon") { + reader.option("numFeatures", 2000) + } else if (params.datasetName == "rcv") { + reader.option("numFeatures", 47236) + } + + 
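+ // Editor comment: numFeatures is pinned per dataset so the libsvm reader does not need an
+ // extra pass over the data to infer the feature dimension. A hedged, table-driven sketch of
+ // the same lookup (covering exactly the dataset names handled above):
+ //   val numFeaturesByDataset = Map("mnist8m" -> 784, "higgs" -> 28, "epsilon" -> 2000, "rcv" -> 47236)
+ //   numFeaturesByDataset.get(params.datasetName).foreach(n => reader.option("numFeatures", n.toString))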
val numPtTrainData = if ("no" == params.isRaw) genericPt else pt + val trainingData = reader + .load(trainingDataPath) + .repartition(numPtTrainData) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + + val labelIndexer = new StringIndexer() + .setInputCol("label") + .setOutputCol("indexedLabel") + .fit(trainingData) + + val labelConverter = new IndexToString() + .setInputCol("prediction") + .setOutputCol("predictedLabel") + .setLabels(labelIndexer.labels) + + // Train a RandomForest model + val rf = params.algorithmType match { + case "classification" =>{ + val oldRf = new RandomForestClassifier() + .setLabelCol("indexedLabel") + .setFeaturesCol("features") + .setNumTrees(numTrees) + .setMaxDepth(maxDepth) + .setMaxBins(maxBins) + .setCacheNodeIds(useNodeIdCache) + .setCheckpointInterval(checkpointInterval) + .setMaxMemoryInMB(maxMemoryInMB) + if (featureSubsetStrategy.nonEmpty) + oldRf.setFeatureSubsetStrategy(featureSubsetStrategy) + oldRf + } + case "regression" =>{ + val oldRf = new RandomForestRegressor() + .setLabelCol("indexedLabel") + .setFeaturesCol("features") + .setNumTrees(numTrees) + .setMaxDepth(maxDepth) + .setMaxBins(maxBins) + .setCacheNodeIds(useNodeIdCache) + .setCheckpointInterval(checkpointInterval) + .setMaxMemoryInMB(maxMemoryInMB) + if (featureSubsetStrategy.nonEmpty) + oldRf.setFeatureSubsetStrategy(featureSubsetStrategy) + oldRf + } + } + + val pipeline = new Pipeline() + .setStages(Array(labelIndexer, rf, labelConverter)) + + val paramMap = ParamMap(rf.maxDepth -> maxDepth) + .put(rf.numTrees, numTrees) + val firstParamPair = ParamPair(rf.maxDepth, maxDepth) + val otherParamPairs_1st = ParamPair(rf.maxDepth, maxDepth) + val otherParamPairs_2nd = ParamPair(rf.numTrees, numTrees) + val paramMaps = new Array[ParamMap](2) + for (i <- 0 until paramMaps.size){ + paramMaps(i) = ParamMap(rf.maxDepth -> maxDepth) + .put(rf.numTrees, numTrees) + } + + val model = params.apiName match { + case "fit" => pipeline.fit(trainingData) + case "fit1" => pipeline.fit(trainingData, paramMap) + case "fit2" => + val models = pipeline.fit(trainingData, paramMaps) + models(0) + case "fit3" => pipeline.fit(trainingData, firstParamPair, otherParamPairs_1st, otherParamPairs_2nd) + + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = reader + .load(testDataPath) + .repartition(genericPt) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + // Make predictions. + val predictions = model.transform(testData) + // Select (prediction, true label) and compute test error. 
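+ // Editor comment: the metric computed below is accuracy for classification and RMSE for
+ // regression, both against "indexedLabel" (the StringIndexer output used when the pipeline was
+ // fit). main() then verifies it with UpEvaluationVerify (higher is better) or
+ // DownEvaluationVerify (lower is better) against the corresponding _raw result.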
+ val evaluator = params.algorithmType match { + case "classification" => + new MulticlassClassificationEvaluator() + .setLabelCol ("indexedLabel") + .setPredictionCol ("prediction") + .setMetricName ("accuracy") + case "regression" => + new RegressionEvaluator() + .setLabelCol ("indexedLabel") + .setPredictionCol ("prediction") + .setMetricName ("rmse") + } + val res = evaluator.evaluate(predictions) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def rfRDDJob(spark: SparkSession, params: RFParams): (Double, Double) = { + + val pt = params.pt + val trainingDataPath = params.trainingDataPath + val testDataPath = params.testDataPath + val numTrees = params.numTrees + var maxDepth = params.maxDepth + val maxBins = params.maxBins + val useNodeIdCache = params.useNodeIdCache + val checkpointInterval = params.checkpointInterval + val maxMemoryInMB = params.maxMemoryInMB + val featureSubsetStrategy = params.featureSubsetStrategy + val genericPt = params.genericPt + var numClasses = params.numClasses + + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + + val numFeatures = params.datasetName match { + case "mnist8m" => 784 + case "higgs" => 28 + case "epsilon" =>2000 + case "rcv" => 47236 + } + + val numPtTrainData = if ("no" == params.isRaw) genericPt else pt + val trainingData = MLUtils.loadLibSVMFile(sc, trainingDataPath, numFeatures) + .repartition(numPtTrainData) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val trainingLabelPositive = trainingData.map(i=> if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint (i.label, i.features) + }) + + val model = params.algorithmType match { + case "classification" => + val seed = "org.apache.spark.ml.classification.RandomForestClassifier".hashCode + params.apiName match { + case "train" => + val strategy = new Strategy (Algo.Classification, Gini, maxDepth = maxDepth, + numClasses = numClasses, maxBins = maxBins, useNodeIdCache = useNodeIdCache, + checkpointInterval = checkpointInterval, maxMemoryInMB = maxMemoryInMB) + RandomForest.trainClassifier(trainingLabelPositive, strategy, numTrees, featureSubsetStrategy, seed) + case "train1" => + val categoricalFeaturesInfo = Map[Int, Int]() + RandomForest.trainClassifier(trainingLabelPositive, numClasses, categoricalFeaturesInfo, + numTrees, featureSubsetStrategy, "gini", maxDepth, maxBins, seed) + case "train2" => + val categoricalFeaturesInfo = new java.util.HashMap[java.lang.Integer, java.lang.Integer]() + RandomForest.trainClassifier(trainingLabelPositive.toJavaRDD, numClasses, categoricalFeaturesInfo, + numTrees, featureSubsetStrategy, "gini", maxDepth, maxBins, seed) + } + case "regression" => + val seed = "org.apache.spark.ml.regression.RandomForestRegressor".hashCode + params.apiName match { + case "train" => + val strategy = new Strategy (Algo.Regression, Variance, maxDepth = maxDepth, + numClasses = 0, maxBins = maxBins, useNodeIdCache = useNodeIdCache, + checkpointInterval = checkpointInterval, maxMemoryInMB = maxMemoryInMB) + RandomForest.trainRegressor(trainingLabelPositive, strategy, numTrees, featureSubsetStrategy, seed) + case "train1" => + val categoricalFeaturesInfo = Map[Int, Int]() + RandomForest.trainRegressor(trainingLabelPositive, categoricalFeaturesInfo, + numTrees, featureSubsetStrategy, "variance", maxDepth, maxBins, seed) + case "train2" => + val categoricalFeaturesInfo = new java.util.HashMap[java.lang.Integer, java.lang.Integer]() + RandomForest.trainRegressor(trainingLabelPositive.toJavaRDD, 
categoricalFeaturesInfo, + numTrees, featureSubsetStrategy, "variance", maxDepth, maxBins, seed) + } + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val testData = MLUtils.loadLibSVMFile(sc, testDataPath) + .repartition(genericPt) + .persist(StorageLevel.MEMORY_AND_DISK_SER) + val testLabelPositive = testData.map(i=> if (i.label < 0) { + LabeledPoint(0.0, i.features) + } else { + LabeledPoint (i.label, i.features) + }) + val labeleAndPreds = testLabelPositive.map{ point => + val prediction = model.predict(point.features) + (point.label, prediction) + } + val res = params.algorithmType match { + case "classification" => 1.0 - labeleAndPreds.filter(r => r._1 == r._2).count.toDouble / testLabelPositive.count() + case "regression" => math.sqrt(labeleAndPreds.map{ case(v, p) => math.pow((v - p), 2)}.mean()) + } + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/SPCARunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/SPCARunner.scala new file mode 100644 index 0000000..7f0b66d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/SPCARunner.scala @@ -0,0 +1,205 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.MatrixVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.feature.SPCA +import org.apache.spark.ml.feature.PCA +import org.apache.spark.ml.linalg.Vectors +import org.apache.spark.mllib.linalg.{DenseMatrix, DenseVector} +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class SPCAConfig extends Serializable { + + @BeanProperty var spca: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SPCAParams extends Serializable { + + @BeanProperty var pt: Int = _ + @BeanProperty var k: Int = _ + @BeanProperty var sep: String = _ + @BeanProperty var numCols: Int = _ + @BeanProperty var pcPath: String = _ + @BeanProperty var sigmaPath: String = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object SPCARunner { + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = 
Utils.getStream("conf/ml/spca/spca.yml") + val representer = new Representer + representer.addClassTag(classOf[SPCAParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SPCAConfig]), representer, options) + val description = new TypeDescription(classOf[SPCAParams]) + yaml.addTypeDescription(description) + val configs: SPCAConfig = yaml.load(stream).asInstanceOf[SPCAConfig] + val params = new SPCAParams() + val paramsMap: util.HashMap[String, Object] = configs.spca.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + params.setPt(paramsMap.getOrDefault("pt", "250").asInstanceOf[Int]) + params.setK(paramsMap.getOrDefault("k", "10").asInstanceOf[Int]) + params.setSep(paramsMap.getOrDefault("sep", " ").asInstanceOf[String]) + params.setNumCols(paramsMap.getOrDefault("numCols", "0").asInstanceOf[Int]) + params.setPcPath(paramsMap.getOrDefault("pcPath", null.asInstanceOf[String]).asInstanceOf[String]) + params.setSigmaPath(paramsMap.getOrDefault("sigmaPath", null.asInstanceOf[String]).asInstanceOf[String]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setApiName(apiName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("SPCA") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_${apiName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val costTime = new SPCAKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(MatrixVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${params.getCostTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class SPCAKernel { + + def runJob(spark: SparkSession, params: SPCAParams): Double = { + + import spark.implicits._ + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val startTime = System.currentTimeMillis() + val trainingData = if (params.isRaw == "yes"){ + val numColsBC = sc.broadcast(params.numCols) + val sepBC = sc.broadcast(params.sep) + val data = spark.createDataFrame(sc.textFile(params.dataPath, params.pt) + .map(line => { + val entry = line.split(sepBC.value) + (entry(0).toInt, (entry(1).toInt, entry(2).toDouble)) + 
}).groupByKey() + .map { case (_, vectorEntries) => Vectors.sparse(numColsBC.value, vectorEntries.toSeq) } + .repartition(params.pt) + .map(Tuple1.apply)) + .toDF("matrix").persist(StorageLevel.MEMORY_ONLY) + data + } else { + val data = spark.createDataFrame(sc.textFile(params.dataPath, params.pt) + .map(line => { + val entry = line.split(params.sep) + (entry(0).toInt, (entry(1).toInt, entry(2).toDouble)) + }).groupByKey() + .map{case (_, vectorEntries) => Vectors.sparse(params.numCols, vectorEntries.toSeq)} + .repartition(params.pt) + .map(Tuple1.apply)) + .toDF("matrix") + .persist(StorageLevel.MEMORY_ONLY) + data + } + + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val spca = params.isRaw match { + case "yes" => new PCA().setK(params.k).setInputCol("matrix") + case "no" => new SPCA().setK(params.k).setInputCol("matrix") + } + + val paramMap = ParamMap(spca.k -> params.k) + .put(spca.inputCol, "matrix") + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size - 1) { + paramMaps(i) = ParamMap(spca.k -> params.k) + .put(spca.inputCol, "matrix") + } + val kPair = ParamPair(spca.k, params.k) + val inputColPair = ParamPair(spca.inputCol, "matrix") + val model = params.apiName match { + case "fit" => spca.fit(trainingData) + case "fit1" => spca.fit(trainingData, paramMap) + case "fit2" => + val models = spca.fit(trainingData, paramMaps) + models(0) + case "fit3" => spca.fit(trainingData, kPair, inputColPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(costTime) + + val spcaMat = new DenseMatrix(model.pc.numRows, model.pc.numCols, model.pc.values, model.pc.isTransposed) + MatrixVerify.saveMatrix(spcaMat, params.saveDataPath, sc) + costTime + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/SVDRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/SVDRunner.scala new file mode 100644 index 0000000..8e52105 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/SVDRunner.scala @@ -0,0 +1,174 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.SVDVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.mllib.linalg.{DenseMatrix, DenseVector, SparseVector, Vector, Vectors} +import org.apache.spark.mllib.linalg.distributed.RowMatrix +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer +import breeze.linalg.{scale, DenseMatrix => BDM, DenseVector => BDV, norm => brzNorm} + +import java.io.{File, FileWriter, PrintWriter} +import java.util +import scala.beans.BeanProperty + +class SVDConfig extends Serializable { + + @BeanProperty var svd: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SVDParams extends Serializable { + + @BeanProperty var pt: Int = _ + @BeanProperty var k: Int = _ + @BeanProperty var sep: String = _ + @BeanProperty var dataFormat: String = _ + @BeanProperty var numCols: Int = _ + @BeanProperty var numRows: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty 
var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object SVDRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/svd/svd.yml") + val representer = new Representer + representer.addClassTag(classOf[SVDParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SVDConfig]), representer, options) + val description = new TypeDescription(classOf[SVDParams]) + yaml.addTypeDescription(description) + val configs: SVDConfig = yaml.load(stream).asInstanceOf[SVDConfig] + val paramsMap: util.HashMap[String, Object] = configs.svd.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new SVDParams() + params.setPt(paramsMap.getOrDefault("pt", "250").asInstanceOf[Int]) + params.setK(paramsMap.getOrDefault("k", "10").asInstanceOf[Int]) + params.setSep(paramsMap.getOrDefault("sep", ",").asInstanceOf[String]) + params.setDataFormat(paramsMap.getOrDefault("dataFormat", "dense").asInstanceOf[String]) + params.setNumCols(paramsMap.getOrDefault("numCols", "0").asInstanceOf[Int]) + params.setNumRows(paramsMap.getOrDefault("numRows", "0").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("SVD") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas = Array( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val costTime = new SVDKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(SVDVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: 
${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} +class SVDKernel { + def runJob(spark: SparkSession, params: SVDParams): Double = { + + import spark.implicits._ + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val startTime = System.currentTimeMillis() + val numColsBC = sc.broadcast(params.numCols) + val sepBC = sc.broadcast(params.sep) + val trainingData = if (params.dataFormat == "coo") { + sc.textFile(params.dataPath, params.pt) + .map(line => { + val entry = line.split(sepBC.value) + (entry(0).toInt, (entry(1).toInt, entry(2).toDouble)) + }).groupByKey() + .map{case (_, vectorEntries) => {Vectors.sparse(numColsBC.value, vectorEntries.toSeq)}} + .persist(StorageLevel.MEMORY_ONLY) + } else { + sc.textFile(params.dataPath) + .map(row => Vectors.dense(row.split(sepBC.value).map(_.toDouble))) + .repartition(params.pt) + .persist(StorageLevel.MEMORY_ONLY) + } + + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + + val matrix = new RowMatrix(trainingData, params.numRows, params.numCols) + val model = matrix.computeSVD(params.k, computeU = true) + model.U.rows.count() + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val sigmaPath = s"${params.saveDataPath}/s" + val VPath = s"${params.saveDataPath}/V" + val s = model.s.asInstanceOf[DenseVector] + val V = model.V.asInstanceOf[DenseMatrix] + SVDVerify.saveVector(s, sigmaPath, sc) + SVDVerify.saveMatrix(V, VPath, sc) + + costTime + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/SVMRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/SVMRunner.scala new file mode 100644 index 0000000..0ee4f26 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/SVMRunner.scala @@ -0,0 +1,187 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import org.apache.spark.SparkConf +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.ml.classification.LinearSVC +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.{SparkSession, Encoders} +import org.apache.spark.sql.functions.when +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter} +import java.util.HashMap +import scala.beans.BeanProperty + +class SVMConfig extends Serializable { + @BeanProperty var svm: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SVMParams extends Serializable { + @BeanProperty var numPartitions: Int = _ + @BeanProperty var regParam: Double = _ + @BeanProperty var maxIter: Int = _ + @BeanProperty var tolerance: Double = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + 
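+// Editor sketch (comment only, not used by the runner): like the other runners in this module,
+// SVMRunner resolves its parameter block from conf/ml/svm/svm.yml with a nested lookup keyed by
+// "opt"/"raw" and then by dataset name. In isolation the lookup amounts to:
+//
+//   def svmParamsFor(config: SVMConfig, isRaw: String, datasetName: String): java.util.HashMap[String, Object] =
+//     config.svm.get(if (isRaw == "yes") "raw" else "opt").get(datasetName)
+//
+// which is the same pattern as the `isRaw match { case "no" => "opt"; case "yes" => "raw" }`
+// block in main() below.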
+object SVMRunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, apiName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/svm/svm.yml") + val representer = new Representer + representer.addClassTag(classOf[SVMParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SVMConfig]), representer, options) + val description = new TypeDescription(classOf[SVMParams]) + yaml.addTypeDescription(description) + val config: SVMConfig = yaml.load(stream).asInstanceOf[SVMConfig] + val paramsMap: util.HashMap[String, Object] = config.svm.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new SVMParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setRegParam(paramsMap.get("regParam").asInstanceOf[Double]) + params.setTolerance(paramsMap.get("tolerance").asInstanceOf[Double]) + params.setApiName(apiName) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setCpuName(cpuName) + params.setDatasetName(datasetName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("SVM") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${apiName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_${apiName}" + if (isRaw.equals("yes")) { + appName = s"${params.algorithmName}_${datasetName}_${apiName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + + val (evaluation, costTime) = new SVMKernel().runJob(spark, params) + params.setEvaluation(evaluation) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${evaluation};isCorrect: ${params.isCorrect}") + }catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class SVMKernel { + case class SVCSchema(label: Int, features: Vector) + def runJob(spark: SparkSession,params: SVMParams): (Double, Double) = { + import spark.implicits._ + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + val parsedData = if (params.datasetName == "rcv") { + spark.read.format("libsvm").option("numFeatures", 47236).load(params.trainingDataPath) + .withColumn("label", when($"label" === -1.0, 
0.0).otherwise($"label")) + .repartition(params.numPartitions).cache() + } else if (params.datasetName == "epsilon") { + spark.read.format("libsvm").option("numFeatures", 2000).load(params.trainingDataPath) + .withColumn("label", when($"label" === -1.0, 0.0).otherwise($"label")) + .repartition(params.numPartitions).cache() + } else { + spark.read.schema(Encoders.product[SVCSchema].schema).format("orc").load(params.trainingDataPath) + .repartition(params.numPartitions).cache() + } + println(s"trainingData: ${parsedData.count()}") + + val svm = new LinearSVC() + .setMaxIter(params.maxIter) + .setRegParam(params.regParam) + .setTol(params.tolerance) + + val paramMap = ParamMap(svm.maxIter -> params.maxIter) + .put(svm.regParam, params.regParam) + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(svm.maxIter -> params.maxIter) + .put(svm.regParam, params.regParam) + } + val maxIterParamPair = ParamPair(svm.maxIter, params.maxIter) + val regParamPair = ParamPair(svm.regParam, params.regParam) + val model = params.apiName match { + case "fit" => svm.fit(parsedData) + case "fit1" => svm.fit(parsedData, paramMap) + case "fit2" => + val models = svm.fit(parsedData, paramMaps) + models(0) + case "fit3" => svm.fit(parsedData, maxIterParamPair, regParamPair) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val parsedTest = if (params.datasetName == "rcv") { + spark.read.format("libsvm").option("numFeatures", 47236).load(params.testDataPath) + .withColumn("label", when($"label" === -1.0, 0.0).otherwise($"label")) + .repartition(params.numPartitions).cache() + } else if (params.datasetName == "epsilon") { + spark.read.format("libsvm").option("numFeatures", 2000).load(params.testDataPath) + .withColumn("label", when($"label" === -1.0, 0.0).otherwise($"label")) + .repartition(params.numPartitions).cache() + } else { + spark.read.schema(Encoders.product[SVCSchema].schema).format("orc").load(params.testDataPath) + .repartition(params.numPartitions).cache() + } + + val result = model.transform(parsedTest) + val res = result.filter($"label"===$"prediction").count().toDouble/result.count + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/SimRankRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/SimRankRunner.scala new file mode 100644 index 0000000..cd7c70f --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/SimRankRunner.scala @@ -0,0 +1,154 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.SimRankVerify + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.ml.recommendation.ALS.Rating +import org.apache.spark.ml.recommendation.{SimRank, SimRankOpenSource} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter} +import java.util.HashMap +import scala.beans.BeanProperty + +class SimRankConfig extends Serializable { + + @BeanProperty var simrank: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SimRankParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + @BeanProperty var damp: Double = _ + @BeanProperty var maxIter: Int = _ + + 
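+ // Editor comment: the three fields above are the SimRank hyper-parameters read from
+ // conf/ml/simrank/simrank.yml (partition count, damping factor, iteration count); the fields
+ // below are the test-harness bookkeeping shared by every runner in this module.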
@BeanProperty var isRaw: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var loadDataTime: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} +object SimRankRunner { + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + val datasetCpuName = s"${datasetName}-${cpuName}" + + val stream = Utils.getStream("conf/ml/simrank/simrank.yml") + val representer = new Representer + representer.addClassTag(classOf[SimRankParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SimRankConfig]), representer, options) + val description = new TypeDescription(classOf[SimRankParams]) + yaml.addTypeDescription(description) + val config: SimRankConfig = yaml.load(stream).asInstanceOf[SimRankConfig] + val params = new SimRankParams() + val paramsMap = config.simrank.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetCpuName) + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setMaxIter(paramsMap.get("maxIter").asInstanceOf[Int]) + params.setDamp(paramsMap.get("damp").asInstanceOf[Double]) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setIfCheck(ifCheck) + params.setAlgorithmName("SimRank") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val costTime = new SimRankKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(SimRankVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + + +class SimRankKernel { + + def runJob(spark: SparkSession, params: SimRankParams): Double = { + val sc = spark.sparkContext + val startTime = System.currentTimeMillis() + var costTime: Double = 0 + + import spark.implicits._ + val userCol = "user" + val itemCol = "item" + val df = 
spark.sparkContext.objectFile[Rating[Int]](params.getDataPath).repartition(params.getNumPartitions) + .map(row => { + ("user-" + row.user.toString, "item-" + row.item.toString) + }).toDF(userCol, itemCol) + + val loadDataTime = (System.currentTimeMillis() - startTime) / 1000.0 + params.setLoadDataTime(loadDataTime) + if (params.getIsRaw.equals("no")) { + val simrank = new SimRank() + .setDamp(params.getDamp) + .setNumIter(params.getMaxIter) + .setUserCol(userCol) + .setItemCol(itemCol) + val simrankRes = simrank.computeSimilarity(df) + simrankRes.itemSimilarity.foreach(_ => {}) + simrankRes.userSimilarity.foreach(_ => {}) + costTime = (System.currentTimeMillis() - startTime) / 1000.0 + SimRankVerify.saveRes(simrankRes.userSimilarity, simrankRes.itemSimilarity, params.saveDataPath, sc) + + } else { + val simrankRes = new SimRankOpenSource().execute(df, (userCol, itemCol), params.getDamp, params.getMaxIter) + simrankRes._1.foreach(_ => {}) + simrankRes._2.foreach(_ => {}) + costTime = (System.currentTimeMillis() - startTime) / 1000.0 + SimRankVerify.saveRes(simrankRes._1, simrankRes._2, params.saveDataPath, sc) + } + costTime + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/SpearManRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/SpearManRunner.scala new file mode 100644 index 0000000..66f6042 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/SpearManRunner.scala @@ -0,0 +1,190 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.MatrixVerify + +import org.apache.spark.SparkConf +import org.apache.spark.ml +import org.apache.spark.ml.linalg.SQLDataTypes.VectorType +import org.apache.spark.ml.linalg.{Matrix, Vectors, Vector} +import org.apache.spark.mllib.linalg.DenseMatrix +import org.apache.spark.mllib +import org.apache.spark.mllib.linalg.{Vectors => OldVectors} +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.types.{StructField, StructType} +import org.apache.spark.storage.StorageLevel +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.rdd.RDD +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class SpearManConfig extends Serializable { + + @BeanProperty var spearman: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class SpearManParams extends Serializable { + + @BeanProperty var numPartitions: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var dataStructure: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var resultSum: Double = _ + @BeanProperty var resultAvg: Double = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object SpearManRunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (dataStructure, datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val cpuName 
= args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/spearman/spearman.yml") + val representer = new Representer + representer.addClassTag(classOf[SpearManParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[SpearManConfig]), representer, options) + val description = new TypeDescription(classOf[SpearManParams]) + yaml.addTypeDescription(description) + val config: SpearManConfig = yaml.load(stream).asInstanceOf[SpearManConfig] + val paramsMap: util.HashMap[String, Object] = config.spearman.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new SpearManParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setDataStructure(dataStructure) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("SpearMan") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}_${dataStructure}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${dataStructure}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${dataStructure}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + val (resultSum,resultAvg,costTime) = dataStructure match { + case "dataframe" => + new SpearManKernel().runDataframeJob(spark, params) + case "rdd" => + new SpearManKernel().runRddJob(spark, params) + } + params.setResultSum(resultSum) + params.setResultAvg(resultAvg) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(MatrixVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${params.getCostTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class SpearManKernel { + def runDataframeJob(spark: SparkSession,params: SpearManParams): (Double, Double, Double) = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + val startTime = System.currentTimeMillis() + + val data = spark.createDataFrame( + spark + .sparkContext + .textFile(params.dataPath) + .map(x=>Row(Vectors.dense(x.split(",").map(_.toDouble)))) + .repartition(params.numPartitions), + StructType(List(StructField("matrix", VectorType))) + ).persist(StorageLevel.MEMORY_ONLY) + + val mat_df = ml.stat.Correlation.corr(data,"matrix", method = "spearman") + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val mat = mat_df.collect()(0).getAs[Matrix](0) + val result = mat.toArray + val result_avg = result.sum/result.length + val spearManMat = new DenseMatrix(mat.numRows, mat.numCols, mat.toArray, mat.isTransposed) 
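+ // Editor comment: ml.stat.Correlation.corr returns the Spearman matrix as a single
+ // ml.linalg.Matrix cell in a one-row DataFrame; it is repacked above into an mllib DenseMatrix
+ // only so MatrixVerify.saveMatrix can persist it in the same format as the RDD path below.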
+ + + MatrixVerify.saveMatrix(spearManMat, params.saveDataPath, sc) + (result.sum, result_avg, costTime) + } + + def runRddJob(spark: SparkSession,params: SpearManParams): (Double, Double, Double) = { + val sc = spark.sparkContext + val fs = FileSystem.get(sc.hadoopConfiguration) + + val startTime = System.currentTimeMillis() + + val data = spark.createDataFrame( + spark + .sparkContext + .textFile(params.dataPath) + .map(x=>Row(Vectors.dense(x.split(",").map(_.toDouble)))) + .repartition(params.numPartitions), + StructType(List(StructField("matrix", VectorType))) + ).persist(StorageLevel.MEMORY_ONLY) + + val rdd = data.select("matrix").rdd.map{ + case Row(v: Vector) => OldVectors.fromML(v) + } + val mat_rdd = mllib.stat.Statistics.corr(rdd, method = "spearman") + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + + val result = mat_rdd.toArray + val result_avg = result.sum/result.length + val spearManMat = mat_rdd.asInstanceOf[DenseMatrix] + + MatrixVerify.saveMatrix(spearManMat, params.saveDataPath, sc) + (result.sum, result_avg, costTime) + } +} + + + + diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/TERunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/TERunner.scala new file mode 100644 index 0000000..1abf7e4 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/TERunner.scala @@ -0,0 +1,339 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.utils.TimeUtils +import com.bigdata.compare.ml.TEVerify + +import ai.h2o.sparkling.{H2OConf, H2OContext} +import ai.h2o.sparkling.ml.features.H2OTargetEncoder +import ai.h2o.sparkling.ml.models.H2OTargetEncoderModel +import org.apache.spark.ml.feature.{TargetEncoder, TargetEncoderModel} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.SparkConf +import org.apache.spark.sql.types.{IntegerType, StructField, StructType} +import org.apache.spark.sql.{Dataset, DataFrame, Encoders, SaveMode, SparkSession} +import org.apache.spark.sql.functions.{col, countDistinct} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.FileWriter +import java.util +import java.util.Date +import scala.beans.BeanProperty + + +class TEConfig extends Serializable { + @BeanProperty var te: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class TEParams extends Serializable { + @BeanProperty var numPartitions: Int = _ + @BeanProperty var problemType: String = _ + @BeanProperty var targetColName: String = _ + @BeanProperty var blendedAvgSmoothing: Int = _ + @BeanProperty var blendedAvgInflectionPoint: Int = _ + + @BeanProperty var trainingDataPath: String = _ + @BeanProperty var testDataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object TERunner { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = { + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + } + val 
dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainingDataPath, testDataPath) = (dataPathSplit(0), dataPathSplit(1)) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/te/te.yml") + val representer = new Representer + representer.addClassTag(classOf[TEParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[TEConfig]), representer, options) + val description = new TypeDescription(classOf[TEParams]) + yaml.addTypeDescription(description) + val configs: TEConfig = yaml.load(stream).asInstanceOf[TEConfig] + val paramsMap: util.HashMap[String, Object] = configs.te.get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(datasetName) + val params = new TEParams() + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setProblemType(paramsMap.get("problemType").asInstanceOf[String]) + params.setTargetColName(paramsMap.get("targetColName").asInstanceOf[String]) + params.setBlendedAvgSmoothing(paramsMap.get("blendedAvgSmoothing").asInstanceOf[Int]) + params.setBlendedAvgInflectionPoint(paramsMap.get("blendedAvgInflectionPoint").asInstanceOf[Int]) + params.setTrainingDataPath(trainingDataPath) + params.setTestDataPath(testDataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("TargetEncoder") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + val costTime = new TEKernel().runJob(spark, params) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if(ifCheck.equals("yes")){ + params.setIsCorrect(TEVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s;isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class TEKernel { + def runJob(spark: SparkSession, params: TEParams): Double = { + val sc = spark.sparkContext + val timer = new TimeUtils.Timer("main").start() + val targetColName = params.targetColName + val problemType = params.problemType + val numPartitions = params.numPartitions + val startTime = System.currentTimeMillis() + + import spark.implicits._ + val encodedTrainSavePath = s"${params.saveDataPath}/encodedTrain" + val encodedTestSavePath = s"${params.saveDataPath}/encodedTest" + val trainSchema = getStructTypeWithFoldCol(getSchemaByName(params.datasetName)) + var trainDF = spark.read.option("header", false) + .option("timestampFormat", 
"yyyy/MM/dd HH:mm:ss ZZ") + .schema(trainSchema) + .csv(params.trainingDataPath) + trainDF = trainDF.na.fill(0).na.fill("MISSING").repartition(numPartitions).cache() + trainDF.rdd.foreachPartition(_ => {}) + println(s"trainingDF.numpartitions: ${trainDF.rdd.getNumPartitions}") + timer.tic("[load & preprocessTrain]") + + val (catFeaturesSFs, numFeaturesSFs) = trainDF.schema + .filter(_.name != targetColName) + .filter(_.name != "foldCol") + .partition(_.dataType.toString == "StringType") + val catFeatures = catFeaturesSFs.map(_.name).toArray + val numFeatures = numFeaturesSFs.map(_.name).toArray + val featureCols = (numFeatures ++ catFeatures.map(x => s"${x}_te")).map(col(_)) + + showCardinality(catFeatures, trainDF) + if (params.isRaw == "yes") { + val conf = new H2OConf().setLogLevel("WARN") + H2OContext.getOrCreate(conf) + } + + timer.tic("[init]") + timer.report() + var encoderModel: Any = null + val encodedTrain = if (params.isRaw == "yes") { + var encoder = new H2OTargetEncoder() + .setInputCols(catFeatures) + .setHoldoutStrategy("KFold") + .setProblemType(problemType) + .setFoldCol("foldCol") + .setLabelCol(targetColName) + .setNoise(0.0) + .setBlendedAvgEnabled(true) + .setBlendedAvgSmoothing(params.blendedAvgSmoothing) + .setBlendedAvgInflectionPoint(params.blendedAvgInflectionPoint) + .fit(trainDF) + timer.tic("[fit]") + encoderModel = encoder + encoder.transformTrainingDataset(trainDF).select((featureCols :+ col(targetColName)): _ * ) + } else { + var encoder = new TargetEncoder() + .setInputCols(catFeatures) + .setProblemType(problemType) + .setFoldCol("foldCol") + .setLabelCol(targetColName) + .setBlendedAvgSmoothing(20) + .setBlendedAvgInflectionPoint(10) + .fit(trainDF) + timer.tic("[fit]") + encoderModel = encoder + encoder.transformTrainingDataset(trainDF).select((featureCols :+ col(targetColName)): _ * ) + } + encodedTrain.write.mode(SaveMode.Overwrite).parquet(encodedTrainSavePath) + timer.tic("[transformTrain]") + timer.report() + + println("Transforming test dataset for functionality test. 
This is not included in training time.") + val testSchema = getSchemaByName(params.datasetName) + var testDF = spark.read.option("header", false) + .option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .schema(testSchema) + .csv(params.testDataPath) + testDF = testDF.na.fill(0).na.fill("MISSING").repartition(numPartitions).cache() + testDF.rdd.foreachPartition(_ => {}) + timer.tic("[load & preprocessTest]") + val encodedTest = if (params.isRaw == "yes") { + val encoder = encoderModel.asInstanceOf[H2OTargetEncoderModel] + val encodedTest = encoder.transform(testDF).select((featureCols :+ col(targetColName)): _ * ) + timer.tic("[transformTest]") + encodedTest + } else { + val encoder = encoderModel.asInstanceOf[TargetEncoderModel] + val encodedTest = encoder.transform(testDF).select((featureCols :+ col(targetColName)): _ * ) + timer.tic("[transformTest]") + encodedTest + } + encodedTest.write.mode(SaveMode.Overwrite).parquet(encodedTestSavePath) + val costTime = (System.currentTimeMillis() - startTime) / 1000 + println("Transform test complete.") + costTime + } + + def showCardinality(catFeatures: Array[String], trainingDF: Dataset[_]): Unit = { + val cards = catFeatures.map { catF => + countDistinct(col(catF)) + } + val totalCardinality = trainingDF.select(cards: _ * ).first().toSeq.map(_.asInstanceOf[Long]).sum + println(s"Number of instances: ${trainingDF.count()}, Total cardinalities: ${totalCardinality}.") + trainingDF.show(10) + } + + def getSchemaByName(name: String): StructType = { + val lowerCaseName = name.toLowerCase() + if (lowerCaseName.startsWith("criteo")) { + Encoders.product[Criteo].schema + } else if (lowerCaseName == "click") { + Encoders.product[Click].schema + } else if (lowerCaseName == "taobao") { + Encoders.product[Taobao].schema + } else if (lowerCaseName == "movielens") { + Encoders.product[MovieLens].schema + } else if (lowerCaseName == "adult") { + Encoders.product[Adult].schema + } else null + } + + def getStructTypeWithFoldCol(tpe: StructType): StructType = { + StructType(tpe.toArray :+ StructField("foldCol", IntegerType, false)) + } + +} + +case class Adult( + age: Option[Double], + workclass: Option[String], + fnlwgt: Option[Double], + education: Option[String], + education_num: Option[Double], + marital_status: Option[String], + occupation: Option[String], + relationship: Option[String], + race: Option[String], + sex: Option[String], + capitalgain: Option[Double], + capitalloss: Option[Double], + hoursperweek: Option[Double], + native_country: Option[String], + target: Double) + +case class Click( + target: Double, + var1: Option[String], + var2: Option[String], + var3: Option[String], + var4: Option[String], + var5: Option[Double], + var6: Option[Double], + var7: Option[String], + var8: Option[String], + var9: Option[String], + var10: Option[String], + var11: Option[String]) + +case class Criteo( + target: Double, + var1: Option[Double], + var2: Option[Double], + var3: Option[Double], + var4: Option[Double], + var5: Option[Double], + var6: Option[Double], + var7: Option[Double], + var8: Option[Double], + var9: Option[Double], + var10: Option[Double], + var11: Option[Double], + var12: Option[Double], + var13: Option[Double], + var14: Option[String], + var15: Option[String], + var16: Option[String], + var17: Option[String], + var18: Option[String], + var19: Option[String], + var20: Option[String], + var21: Option[String], + var22: Option[String], + var23: Option[String], + var24: Option[String], + var25: Option[String], + var26: Option[String], + var27: 
Option[String], + var28: Option[String], + var29: Option[String], + var30: Option[String], + var31: Option[String], + var32: Option[String], + var33: Option[String], + var34: Option[String], + var35: Option[String], + var36: Option[String], + var37: Option[String], + var38: Option[String], + var39: Option[String]) + +case class Taobao( + userID: Option[String], + itemID: Option[String], + categoryID: Option[String], + target: Double) + +case class MovieLens( + userID: Option[String], + movieID: Option[String], + target: Double) \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/Word2VecRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/Word2VecRunner.scala new file mode 100644 index 0000000..f3ddd19 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/Word2VecRunner.scala @@ -0,0 +1,185 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.Word2VecEvaluation + +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.SparkSession +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import org.apache.spark.ml.feature.{Word2Vec, Word2VecModel} +import org.apache.spark.ml.param.{ParamMap, ParamPair} +import org.apache.spark.mllib.feature.{Word2VecModel => mllibWord2VecModel} + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class Word2VecConfig extends Serializable { + @BeanProperty var word2vec: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]]] = _ +} + +class Word2VecParams extends Serializable { + @BeanProperty var numPartitions: Int = _ + @BeanProperty var vectorSize: Int = _ + @BeanProperty var minCount: Int = _ + @BeanProperty var window: Int = _ + @BeanProperty var numIterations: Int = _ + @BeanProperty var learningRate: Double = _ + @BeanProperty var regularization: Double = _ + @BeanProperty var repetition: Int = _ + @BeanProperty var eval: String = _ + @BeanProperty var apiName: String = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var metric: Double = _ + + @BeanProperty var sentenceFile: String = _ + @BeanProperty var downstreamTrainFile: String = _ + @BeanProperty var downstreamTestFile: String = _ + @BeanProperty var fieldName: String = _ + @BeanProperty var isRaw: String = _ +} + +object Word2VecRunner { + def main(args: Array[String]): Unit = { + try { + val sentenceFile = args(0) + val downstreamTrainFile = args(1) + val downstreamTestFile = args(2) + val dataSet = args(3).split("-") + val (fieldName, apiName, scalaVersion) = (dataSet(0), dataSet(1), dataSet(2)) + val isRaw = args(4) + val sparkConfSplit = args(5).split("_") + val (master, deployMode, numExec, execCores, execMem) = + (sparkConfSplit(0), sparkConfSplit(1), sparkConfSplit(2), sparkConfSplit(3), sparkConfSplit(4)) + + val stream = Utils.getStream("conf/ml/word2vec/word2vec.yml") + val typeRaw = isRaw match { + case "no" => "opt" + case "yes" => "raw" + } + + val representer = new Representer + representer.addClassTag(classOf[Word2VecParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[Word2VecConfig]), representer, options) + val description = new TypeDescription(classOf[Word2VecParams]) + yaml.addTypeDescription(description) + + val configs: 
Word2VecConfig = yaml.load(stream).asInstanceOf[Word2VecConfig] + val params = new Word2VecParams() + val paramsMap: util.HashMap[String, Object] = configs.word2vec.get(typeRaw).get("scala" + scalaVersion).get(fieldName) + + params.setSentenceFile(sentenceFile) + params.setDownstreamTrainFile(downstreamTrainFile) + params.setDownstreamTestFile(downstreamTestFile) + params.setFieldName(fieldName) + params.setIsRaw(isRaw) + params.setApiName(apiName) + params.setNumPartitions(paramsMap.get("numPartitions").asInstanceOf[Int]) + params.setVectorSize(paramsMap.get("vectorSize").asInstanceOf[Int]) + params.setMinCount(paramsMap.get("minCount").asInstanceOf[Int]) + params.setWindow(paramsMap.get("window").asInstanceOf[Int]) + params.setNumIterations(paramsMap.get("numIterations").asInstanceOf[Int]) + params.setLearningRate(paramsMap.get("learningRate").asInstanceOf[Double]) + params.setEval(paramsMap.get("eval").asInstanceOf[String]) + val conf = new SparkConf().setAppName(s"word2vec_${fieldName}_${apiName}").setMaster(master) + val commonParas = Array ( + ("spark.submit.deployMode", deployMode), + ("spark.executor.instances", numExec), + ("spark.executor.cores", execCores), + ("spark.executor.memory", execMem) + ) + conf.setAll(commonParas) + + if ("no" == isRaw.asInstanceOf[String]) { + params.setRegularization(paramsMap.get("regularization").asInstanceOf[Double]) + params.setRepetition(paramsMap.get("repetition").asInstanceOf[Int]) + conf.set("spark.boostkit.mllib.feature.word2vec.regularization", params.regularization.toString) + conf.set("spark.boostkit.mllib.feature.word2vec.repetition", params.repetition.toString) + } + val sc = new SparkContext(conf) + + val startTime = System.currentTimeMillis() + + val spark = SparkSession.builder.config(conf).getOrCreate() + import spark.implicits._ + val sentences = sc.objectFile[Array[String]](sentenceFile, params.numPartitions) + .map(_.toSeq) + .cache() + .setName("ori") + .toDF("sentences") + println(s"********** read data, sentenceCnt=${sentences.count()}, time=${(System.currentTimeMillis() - startTime) / 60000.0} **********") + + val exeTime = System.currentTimeMillis() + val w2v = new Word2Vec() + .setInputCol("sentences") + .setNumPartitions(params.numPartitions) + .setWindowSize(params.window) + .setStepSize(params.learningRate) + .setVectorSize(params.vectorSize) + .setMaxIter(params.numIterations) + + val paramMap = ParamMap(w2v.minCount -> params.minCount) + .put(w2v.windowSize, params.window) + + val paramMaps: Array[ParamMap] = new Array[ParamMap](2) + for (i <- 0 to paramMaps.size -1) { + paramMaps(i) = ParamMap(w2v.minCount -> params.minCount) + .put(w2v.windowSize, params.window) + } + + val learnParamPair = ParamPair(w2v.stepSize, params.learningRate) + val maxIterParamPair = ParamPair(w2v.maxIter, params.numIterations) + val maxVectorParamPair = ParamPair(w2v.vectorSize, params.vectorSize) + + val model = params.apiName match { + case "fit1" => w2v.fit(sentences, paramMap) + case "fit2" => + val models = w2v.fit(sentences, paramMaps) + models(0) + case "fit3" => w2v.fit(sentences, learnParamPair, maxIterParamPair, maxVectorParamPair) + case _ => w2v.fit(sentences) + } + println(s"********** train time=${(System.currentTimeMillis() - exeTime) / 60000.0} **********") + + val ju = scala.reflect.runtime.universe + val mirror = ju.runtimeMirror(model.getClass.getClassLoader) + val instanceMirror = mirror.reflect(model) + val myMethod = ju.typeOf[Word2VecModel].decl(ju.TermName("wordVectors")).asMethod + val myFunc = 
instanceMirror.reflectMethod(myMethod) + val mllibModel = myFunc().asInstanceOf[mllibWord2VecModel] + + val metric = params.eval match { + case "taobao" => Word2VecEvaluation.evaluateTaobaoProductCTR(spark, downstreamTrainFile, downstreamTestFile, mllibModel, params.numPartitions) + case "alibaba" => Word2VecEvaluation.evaluateAlibabaCTR(spark, Array(fieldName), downstreamTrainFile, downstreamTestFile, Array(mllibModel), params.numPartitions) + } + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + println(s"********** metric=${metric} **********") + + params.setCostTime(costTime) + params.setMetric(metric) + + val folder = new File("report") + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + val writer = new FileWriter(s"report/Word2Vec_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${costTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/ml/XGBTRunner.scala b/tools/kal-test/src/main/scala/com/bigdata/ml/XGBTRunner.scala new file mode 100644 index 0000000..cef99dd --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/ml/XGBTRunner.scala @@ -0,0 +1,229 @@ +package com.bigdata.ml + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify +import com.bigdata.compare.ml.DownEvaluationVerify + +import ml.dmlc.xgboost4j.scala.spark.{XGBoostClassifier, XGBoostRegressor} +import org.apache.spark.SparkConf +import org.apache.spark.ml.evaluation.{MulticlassClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.sql.{Dataset, Row, SparkSession} +import org.apache.spark.storage.StorageLevel +import org.yaml.snakeyaml.{DumperOptions,TypeDescription,Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.util +import java.io.{File, FileWriter, PrintWriter, StringWriter} +import scala.beans.BeanProperty + +class XGBTConfig extends Serializable{ + @BeanProperty var xgbt: util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String, util.HashMap[String,Object]]]]]=_ +} +class XGBTParams extends Serializable{ + @BeanProperty var eta: Double = _ + @BeanProperty var gamma: Double = _ + @BeanProperty var min_child_weight: Int = _ + @BeanProperty var max_depth: Int = _ + @BeanProperty var allow_non_zero_for_missing: Boolean = _ + @BeanProperty var vectorType: String = _ + @BeanProperty var enable_bbgen: Boolean = _ + @BeanProperty var rabit_enable_tcp_no_delay: Boolean = _ + @BeanProperty var objective: String = _ + @BeanProperty var num_round: Int = _ + @BeanProperty var num_workers: Int = _ + @BeanProperty var nthread: Int = _ + @BeanProperty var tree_method: String = _ + @BeanProperty var grow_policy: String = _ + + @BeanProperty var algorithmName: String = _ + @BeanProperty var algorithmType: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var mode: String = _ + @BeanProperty var verbosity: Int = _ + @BeanProperty var tr_fname: String = _ + @BeanProperty var ts_fname: String = _ + @BeanProperty var num_class: Int = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var saveDataPath: String = _ + 
@BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} +object XGBTRunner { + def main(args: Array[String]): Unit = { + try{ + val modelConf = args(0) + val modelConfSplit = modelConf.split("-") + val (algorithmType, datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2), modelConfSplit(3)) + val dataPath = args(1) + val dataPathSplit = dataPath.split(",") + val (trainDataPath,testDataPath)=(dataPathSplit(0),dataPathSplit(1)) + val cpuName = args(2) + val sparkConfSplit = args(3).split("_") + val (master,deployMode)=(sparkConfSplit(0), sparkConfSplit(1)) + val saveResultPath = args(4) + + val stream = Utils.getStream("conf/ml/xgbt/xgbt.yml") + val representer =new Representer + representer.addClassTag(classOf[XGBTParams],Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[XGBTConfig]),representer,options) + val description = new TypeDescription(classOf[XGBTParams]) + yaml.addTypeDescription(description) + val configs: XGBTConfig = yaml.load(stream).asInstanceOf[XGBTConfig] + val params = new XGBTParams() + val paramsMap: util.HashMap[String,Object] = configs.xgbt.get(cpuName).get(isRaw match { + case "no" => "opt" + case "yes" => "raw" + }).get(algorithmType).get(datasetName) + params.setEta(paramsMap.get("eta").asInstanceOf[Double]) + params.setGamma(paramsMap.get("gamma").asInstanceOf[Int]) + params.setMin_child_weight(paramsMap.get("min_child_weight").asInstanceOf[Int]) + params.setMax_depth(paramsMap.get("max_depth").asInstanceOf[Int]) + params.setAllow_non_zero_for_missing(paramsMap.get("allow_non_zero_for_missing").asInstanceOf[Boolean]) + params.setVectorType(paramsMap.get("vectorType").asInstanceOf[String]) + params.setEnable_bbgen(paramsMap.get("enable_bbgen").asInstanceOf[Boolean]) + params.setRabit_enable_tcp_no_delay(paramsMap.get("rabit_enable_tcp_no_delay").asInstanceOf[Boolean]) + params.setObjective(paramsMap.get("objective").asInstanceOf[String]) + params.setNum_round(paramsMap.get("num_round").asInstanceOf[Int]) + params.setNum_workers(paramsMap.get("num_workers").asInstanceOf[Int]) + params.setNthread(paramsMap.get("nthread").asInstanceOf[Int]) + params.setTree_method(paramsMap.get("tree_method").asInstanceOf[String]) + params.setGrow_policy(paramsMap.get("grow_policy").asInstanceOf[String]) + params.setAlgorithmType(algorithmType) + params.setTr_fname(trainDataPath) + params.setTs_fname(testDataPath) + params.setIsRaw(isRaw) + params.setDatasetName(datasetName) + params.setIfCheck(ifCheck) + params.setAlgorithmName("XGBT") + params.setSaveDataPath(s"${saveResultPath}/${params.algorithmName}/${algorithmType}_${datasetName}") + params.setVerifiedDataPath(s"${params.saveDataPath}_raw") + if(s"${algorithmType}_${datasetName}"=="classification_mnist8m"){ + params.setNum_class(paramsMap.get("num_class").asInstanceOf[Int]) + } + var appName = s"${params.algorithmName}_${algorithmType}_${datasetName}" + if (isRaw.equals("yes")){ + appName = s"${params.algorithmName}_${algorithmType}_${datasetName}_raw" + params.setVerifiedDataPath(params.saveDataPath) + params.setSaveDataPath(s"${params.saveDataPath}_raw") + } + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName).setMaster(master) + val commonParas =Array( + ("spark.submit.deployMode",deployMode) + ) + conf.setAll(commonParas) + val spark= SparkSession.builder().config(conf).getOrCreate() + 
val (res, costTime) = new XGBTKernel().runJob(spark, params)
+      params.setEvaluation(res)
+      params.setCostTime(costTime)
+
+      Utils.checkDirs("report")
+      if (ifCheck.equals("yes")) {
+        val isCorrect = params.algorithmType match {
+          case "classification" => UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)
+          case "regression" => DownEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)
+        }
+        params.setIsCorrect(isCorrect)
+        val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true)
+        writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n")
+        writerIsCorrect.close()
+      }
+
+      val writer = new FileWriter(s"report/${params.testcaseType}_${
+        Utils.getDateStrFromUTC("yyyyMMdd_HHmmss",
+          System.currentTimeMillis())
+      }.yml")
+      yaml.dump(params, writer)
+      println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res}; isCorrect: ${params.isCorrect}")
+    } catch {
+      case e: Throwable =>
+        println(s"Exec Failure: ${e.getMessage}")
+        val sw: StringWriter = new StringWriter()
+        val pw: PrintWriter = new PrintWriter(sw)
+        e.printStackTrace(pw)
+        println("=============>>printStackTraceStr Exception: " + e.getClass + "\n===>" + sw.toString)
+        throw e
+    }
+  }
+}
+
+class XGBTKernel {
+  def runJob(spark: SparkSession, params: XGBTParams): (Double, Double) = {
+    val sc = spark.sparkContext
+    var paramsAnyMap: scala.Predef.Map[String, Any] = Map[String, Any]()
+    paramsAnyMap += ("eta" -> params.getEta)
+    paramsAnyMap += ("gamma" -> params.getGamma)
+    paramsAnyMap += ("min_child_weight" -> params.getMin_child_weight)
+    paramsAnyMap += ("max_depth" -> params.getMax_depth)
+    paramsAnyMap += ("allow_non_zero_for_missing" -> params.getAllow_non_zero_for_missing)
+    paramsAnyMap += ("vectorType" -> params.getVectorType)
+    paramsAnyMap += ("objective" -> params.getObjective)
+    paramsAnyMap += ("num_round" -> params.getNum_round)
+    paramsAnyMap += ("num_workers" -> params.getNum_workers)
+    paramsAnyMap += ("nthread" -> params.getNthread)
+    paramsAnyMap += ("tree_method" -> params.getTree_method)
+    paramsAnyMap += ("grow_policy" -> params.getGrow_policy)
+    if (s"${params.algorithmType}_${params.datasetName}" == "classification_mnist8m") {
+      paramsAnyMap += ("num_class" -> params.getNum_class)
+    }
+    if (params.isRaw == "no") {
+      paramsAnyMap += ("enable_bbgen" -> params.getEnable_bbgen)
+      paramsAnyMap += ("rabit_enable_tcp_no_delay" -> params.getRabit_enable_tcp_no_delay)
+    }
+
+    val start_time = System.currentTimeMillis()
+    val XGBTrain = params.algorithmType match {
+      case "classification" =>
+        new XGBoostClassifier(paramsAnyMap).setLabelCol("label").setFeaturesCol("features")
+      case "regression" =>
+        new XGBoostRegressor(paramsAnyMap).setLabelCol("label").setFeaturesCol("features")
+    }
+
+    val train_data = getTrainData(spark, params).persist(StorageLevel.MEMORY_AND_DISK_SER)
+
+    val model = XGBTrain.fit(train_data)
+
+    val fitEndTime = System.currentTimeMillis()
+    val test_data = getTestData(spark, params).persist(StorageLevel.MEMORY_AND_DISK_SER)
+    val predictions = model.transform(test_data)
+
+    val evaluator = params.algorithmType match {
+      case "classification" =>
+        new MulticlassClassificationEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("accuracy")
+      case "regression" =>
+        new RegressionEvaluator().setLabelCol("label").setPredictionCol("prediction").setMetricName("rmse")
+    }
+    val res = evaluator.evaluate(predictions)
+    println(s"Test Error = ${(1.0 - res)}")
+    val costTime = (fitEndTime - start_time) / 1000.0
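+    // Note: costTime covers estimator setup, training-data loading and model fitting only;
+    // the prediction and evaluation steps below are not included in the reported time.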
+ predictions.select("prediction","label","features").show(5) + Utils.saveEvaluation(res, params.saveDataPath, sc) + (res, costTime) + } + + def getTrainData(spark:SparkSession,config:XGBTParams): Dataset[Row] ={ + val tr_fname = config.getTr_fname + println("tr_fname",tr_fname) + var reader = spark.read.format("libsvm").option("vectorType",config.getVectorType) + val tr_data = reader.load(tr_fname) + tr_data + } + def getTestData(spark: SparkSession, config: XGBTParams): Dataset[Row] = { + val ts_fname = config.getTs_fname + println("ts_fname" , ts_fname) + var reader= spark.read.format("libsvm").option("vectorType",config.getVectorType) + val ts_data = reader.load(ts_fname) + ts_data + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/IncDataGeneratorBatch.scala b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/IncDataGeneratorBatch.scala new file mode 100644 index 0000000..3ea2049 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/IncDataGeneratorBatch.scala @@ -0,0 +1,115 @@ +package com.bigdata.preprocess.graph + +import com.bigdata.graph.Util + +import org.apache.spark.graphx.lib.TrillionPageRank +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{Row, SaveMode, SparkSession} +import org.apache.spark.storage.StorageLevel +import org.apache.spark.{HashPartitioner, SparkConf} + +object IncDataGeneratorBatch { + def main(args: Array[String]): Unit = { + val host = args(0) // local or yarn + val inputPath = args(1) + val split = args(2) + val outputPath = args(3) + val rate = args(4).toDouble + val partition = args(5).toInt + val seed = args(6).toLong + val iterNum = args(7).toInt + val resetProb = args(8).toDouble + val batchNum = args(9).toInt + + val sparkConf = new SparkConf().setAppName(s"IncDataGenerator-${rate}_batch_${batchNum}").setMaster(host) + sparkConf.set("spark.sql.orc.impl", "native") + sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") + sparkConf.set("spark.rdd.compress", "true") + val spark = SparkSession.builder().config(sparkConf).getOrCreate() + val sc = spark.sparkContext + sc.setLogLevel("WARN") + + val schema = StructType( + Seq( + StructField("srcId", LongType, true), + StructField("srcStatus", IntegerType, true), + StructField("dstId", ArrayType(LongType), true), + StructField("dstStatus", ArrayType(IntegerType), true), + StructField("pr", DoubleType, true) + ) + ) + val partitioner = new HashPartitioner(partition) + var allGraph = Util.readEdgeList(sc, inputPath, "\\s+", partition) + .partitionBy(partitioner) + .setName("batch-allGraph") + .persist(StorageLevel.MEMORY_ONLY_SER) + + for (bn <- 1 to batchNum reverse) { + val allNodes = allGraph.keys.union(allGraph.values).distinct().persist(StorageLevel.MEMORY_ONLY_SER) + val allSrcNodes = allGraph.keys.distinct().persist(StorageLevel.MEMORY_ONLY_SER) + val incNodesWithTag = allSrcNodes.sample(false, rate, seed) + .map(f => (f, 1)) + .persist(StorageLevel.MEMORY_ONLY_SER) + allNodes.unpersist(false) + allSrcNodes.unpersist(false) + + val nodeWithStatus = allGraph.leftOuterJoin(incNodesWithTag, partitioner).map(f => { + (f._2._1, (f._1, f._2._2.getOrElse(0))) + }).leftOuterJoin(incNodesWithTag, partitioner).map(f => { + (f._2._1._1, Array((f._2._1._2, f._1, f._2._2.getOrElse(0)))) + }) + .setName("nodeWithStatus") + .persist(StorageLevel.MEMORY_ONLY_SER) + nodeWithStatus.foreachPartition(f => {}) + allGraph.unpersist(false) + incNodesWithTag.unpersist(false) + + val 
incGraph0to1 = nodeWithStatus.filter(f => { + val a = f._2(0) + a._1 == 0 && a._3 == 1 + }) + val incGraph1to0 = nodeWithStatus.filter(f => { + val a = f._2(0) + a._1 == 1 && a._3 == 0 + }) + val incGraph1to1 = nodeWithStatus.filter(f => { + val a = f._2(0) + a._1 == 1 && a._3 == 1 + }) + + val srcLinks = nodeWithStatus.reduceByKey(_++_).mapValues(f => { + val srcStatus = f(0)._1 + val dstArr = f.map(x => x._2) + val dstStatusArr = f.map(x => x._3) + (srcStatus, dstArr, dstStatusArr) + }) + .setName("srcLinks") + .persist(StorageLevel.MEMORY_ONLY_SER) + srcLinks.foreachPartition(f => {}) + nodeWithStatus.unpersist(false) + val orgGraph = srcLinks.filter(f => f._2._1 == 0).map(f => { + val dstArr = f._2._2.zip(f._2._3) + val dstMap = collection.mutable.Map(dstArr: _ *) + val dstIncMap = dstMap.retain((k, v) => (v == 0)) + (f._1, (1.0, dstIncMap.keys.toArray)) + }).filter(f => f._2._2.nonEmpty) + .setName("orgGraph") + .persist(StorageLevel.MEMORY_ONLY_SER) + orgGraph.foreachPartition(f => {}) + + val orgEges = orgGraph.map(f => f._2._2.length) + val orgPr = TrillionPageRank.run(orgGraph, partition, iterNum, resetProb, false) + val incData = srcLinks.leftOuterJoin(orgPr, partitioner).map(f => + Row(f._1, f._2._1._1, f._2._1._2, f._2._1._3, f._2._2.getOrElse(1.0)) + ).setName("incData") + .persist(StorageLevel.MEMORY_ONLY_SER) + incData.foreachPartition(f => {}) + srcLinks.unpersist(false) + orgPr.unpersist(false) + + val incDataDF = spark.createDataFrame(incData, schema) + incDataDF.write.option("header", "true").mode(SaveMode.Overwrite).orc(outputPath + s"_batch_${bn}") + allGraph = orgGraph.map(f => (f._1, f._2._2)).flatMapValues(f => f) + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/TrillionPageRankDataProcess.scala b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/TrillionPageRankDataProcess.scala new file mode 100644 index 0000000..e670932 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/preprocess/graph/TrillionPageRankDataProcess.scala @@ -0,0 +1,18 @@ +package com.bigdata.preprocess.graph + +import org.apache.spark.{SparkConf, SparkContext} + +object TrillionPageRankDataProcess { + def main(args: Array[String]): Unit = { + val input = args(0) + val output = args(1) + val sparkConf = new SparkConf().setMaster("yarn") + val sc = new SparkContext(sparkConf) + val data = sc.textFile(input, 300).map(f => { + val tmp = f.split("\t") + (tmp(0), Array(tmp(1))) + }) + val adj = data.reduceByKey(_ ++ _) + adj.map(f => f._1 + "_1.0," + f._2.mkString(",")).saveAsTextFile(output) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/EncoderDataGenRun.scala b/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/EncoderDataGenRun.scala new file mode 100644 index 0000000..53a7145 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/preprocess/ml/EncoderDataGenRun.scala @@ -0,0 +1,184 @@ +package com.bigdata.preprocess.ml + +import org.json4s.jackson.Serialization +import org.json4s.DefaultFormats +import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession} +import org.apache.spark.sql.catalyst.util.RandomUUIDGenerator +import org.apache.spark.sql.functions.{col, concat_ws, udf} + +import java.io.{BufferedWriter, File, FileOutputStream, OutputStreamWriter} +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +class DataAndMapGen { + + var mapPath = "" + var dataPath = "" + + def setMapPath(mapPath: String): this.type = { + this.mapPath = mapPath + this + } + + def 
setDataPath(dataPath: String): this.type = {
+    this.dataPath = dataPath
+    this
+  }
+
+  var featureMap: mutable.Map[String, Int] = mutable.Map.empty
+
+  def dataGenFunc(numSamples: Int, spark: SparkSession): Dataset[Row] = {
+    val gen = RandomUUIDGenerator(0L)
+    println("Creating dataset")
+    val rdd = spark.sparkContext.parallelize(0 until numSamples)
+      .map(_ => {
+        var row = mutable.ArrayBuffer[String]()
+        val uid = gen.getNextUUIDUTF8String().toString.replace("-", "")
+        for (strLen <- Array(2, 4, 4)) {
+          for (listLen <- Array(1, 1, 1, 1, 1)) {
+            var offset = 0
+            val list = mutable.ArrayBuffer[String]()
+            for (_ <- 0 until listLen) {
+              list.append(uid.slice(offset, offset + strLen))
+              offset += 1
+            }
+            row.append(list.mkString("^"))
+          }
+        }
+
+        (row(0), row(1), row(2), row(3), row(4), row(5), row(6), row(7), row(8), row(9),
+          row(10), row(11), row(12), row(13), row(14))
+      })
+
+    rdd.saveAsTextFile(dataPath)
+    println("save data done")
+
+    import spark.implicits._
+    val df = spark.sparkContext.textFile(dataPath).map {
+      t =>
+        val row = t.split(",")
+        (row(0), row(1), row(2), row(3), row(4), row(5), row(6), row(7), row(8), row(9),
+          row(10), row(11), row(12), row(13), row(14))
+    }.toDF("1xxx", "2xxx", "3xxx", "4xxx", "5xxx", "6xxx", "7xxx", "8xxx", "9xxx", "10xxx", "11xxx", "12xxx", "13xxx", "14xxx", "15xxx")
+      .repartition(800)
+      .cache()
+
+    df.show()
+    df
+  }
+
+  def updateFeatureMap(input: DataFrame, key: String, value: String): Unit = {
+    val kvMap = input
+      .select(key, value)
+      .rdd
+      .map(x => (x.get(0).toString, x.get(1).toString.toDouble.toInt + 1))
+      .collectAsMap()
+    featureMap ++= kvMap
+  }
+
+  def saveMutableMap(mapFile: Any, savePath: String): Unit = {
+    implicit val formats: DefaultFormats = org.json4s.DefaultFormats
+    val map = mapFile.asInstanceOf[mutable.Map[String, Any]]
+    val data = Serialization.write(map)
+
+    val outputFileName = savePath
+    val outputDir = outputFileName.split("\\/").dropRight(1).mkString("/")
+
+    println(s"file will be saved at ${outputFileName}")
+    val file = new File(outputDir)
+    if (!file.exists()) {
+      // the output directory does not exist yet
+      val mkdirsFlag = file.mkdirs() // create the directory
+      if (mkdirsFlag) {
+        println(s"create dir succeeded: ${outputDir}")
+      }
+    }
+    var writer: BufferedWriter = null
+    try {
+      val fileName = new File(outputFileName)
+      if (fileName.exists()) {
+        val deleteFlg = fileName.delete()
+        if (deleteFlg)
+          println("delete file succeeded")
+      }
+      writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName, true), "UTF-8"))
+      writer.write(data)
+    } finally {
+      if (writer != null) writer.close()
+    }
+  }
+
+  def mapGenFunc(spark: SparkSession, input: DataFrame, inputCols: String*): Unit = {
+    val crossUDF = udf((featOne: String, featTwo: String) => {
+      val featureArrayBuf = ArrayBuffer[String]()
+      for (i <- featOne.split("\\^")) {
+        for (j <- featTwo.split("\\^")) {
+          featureArrayBuf.append(i + Const.ACTUAL_SEP_CHAR + j)
+        }
+      }
+      featureArrayBuf.mkString("^")
+    })
+    for (index <- 0 until inputCols.length) {
+      val cols = inputCols(index)
+      println(s"===================now processing ${cols}, completed ${index}/${inputCols.length - 1}")
+      val columnName = cols.replace(Const.CONFIG_CROSS_SEP_CHAR, Const.ACTUAL_SEP_CHAR)
+      var featureMapDF = if (cols.indexOf(Const.CONFIG_CROSS_SEP_CHAR) != (-1)) {
+        if (cols.split(Const.CONFIG_CROSS_SEP_CHAR).size > 2) {
+          input.withColumn(columnName, concat_ws(Const.ACTUAL_SEP_CHAR, cols.split(Const.CONFIG_CROSS_SEP_CHAR).map(x => col(x.trim())): _*))
+            .select(columnName)
+        } else {
+          input.withColumn(columnName,
crossUDF(cols.split(Const.CONFIG_CROSS_SEP_CHAR).map(x => col(x.trim())): _*)) + } + } else input.select(columnName) + + featureMapDF.na.drop(Array(columnName)).createOrReplaceTempView("featureMap") + featureMapDF = spark.sql(s"select ${columnName},row_number() over(order by cnt desc) as rank from (" + + s"select ${columnName},count(1) as cnt from featureMap group by ${columnName} having count(1)>0)t") + // featureMapDF = spark.sql(s"select ${columnName} from featureMap where length(${columnName})>0 group by ${columnName} having count(1)> ${minCount}") + import spark.implicits._ + // val featureMapRdd = featureMapDF.rdd.map(x => s"${columnName},"+x.get(0).toString).zipWithIndex().map(x => (x._1,x._2+2)) + val featureMapRdd = featureMapDF.rdd.map(x => (s"${columnName}," + x.get(0).toString, x.get(1).toString.toInt + 1)) + + featureMapDF = featureMapRdd.toDF("key", "value") + //append feature index map + updateFeatureMap(featureMapDF, "key", "value") + } + saveMutableMap(featureMap, mapPath) + } +} + +object Const { + val ACTUAL_SEP_CHAR = "__" + val CONFIG_CROSS_SEP_CHAR = "#" + val DENSE_MAP_SEQ_CHAR = "\\*" + val DENSE_MAP_SUFFIX = "_c" + private var MASK_RATE = 0.0 +} + +object EncoderDataGenRun { + def main(args: Array[String]): Unit = { + val isLocal = false + val sparkMaster = if (isLocal) "local[*]" else "yarn" + implicit val spark: SparkSession = SparkSession + .builder() + .appName("DateGen") + .master(sparkMaster) + .getOrCreate() + spark.sparkContext.setLogLevel("ERROR") + + var mapPath = "" + var dataPath = "" + var numSamples = 400000000 + + args.sliding(2, 2).foreach { + case Array("--mapPath", value) => mapPath = value + case Array("--dataPath", value) => dataPath = value + case Array("--numSamples", value) => numSamples = value.toInt + case Array(name, value) => System.err.println(s"[ERROR] unknown argument, name:$name, value:$value") + } + + val dg = new DataAndMapGen().setDataPath(dataPath).setMapPath(mapPath) + val df = dg.dataGenFunc(numSamples, spark) + dg.mapGenFunc(spark, df, df.columns:_*) + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/utils/DTBucketUtils.scala b/tools/kal-test/src/main/scala/com/bigdata/utils/DTBucketUtils.scala new file mode 100644 index 0000000..86be372 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/utils/DTBucketUtils.scala @@ -0,0 +1,35 @@ +package com.bigdata.utils + +import org.apache.spark.ml.linalg.SparseVector +import org.apache.spark.mllib.tree.configuration.FeatureType +import org.apache.spark.mllib.tree.model.Node + +object DTBucketUtils extends Serializable { + def getLeafNodes(node: Node): Array[Int] = { + var treeLeafNodes = new Array[Int](0) + if (node.isLeaf) { + treeLeafNodes = treeLeafNodes.:+(node.id) + } else { + treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.leftNode.get) + treeLeafNodes = treeLeafNodes ++ getLeafNodes(node.rightNode.get) + } + treeLeafNodes + } + + def predictModify(node: Node, features: SparseVector): Int = { + val split = node.split + if (node.isLeaf) { + node.id + } else if (split.get.featureType == FeatureType.Continuous) { + if (features(split.get.feature) <= split.get.threshold) { + predictModify(node.leftNode.get, features) + } else { + predictModify(node.rightNode.get, features) + } + } else if (split.get.categories.contains(features(split.get.feature))) { + predictModify(node.leftNode.get, features) + } else { + predictModify(node.rightNode.get, features) + } + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/utils/TimeUtils.scala 
b/tools/kal-test/src/main/scala/com/bigdata/utils/TimeUtils.scala new file mode 100644 index 0000000..055a737 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/utils/TimeUtils.scala @@ -0,0 +1,48 @@ +package com.bigdata.utils + +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer + +object TimeUtils { + val NANO_TO_SEC = 1000000000.0 + + class Timer (name: String = "Perf") { + val ts = mutable.Map[String, ArrayBuffer[Long]]() + var prev: Long = 0L + var reportOrder = ArrayBuffer[String]() + var startTime = prev + + def start(): this.type = { + prev = System.nanoTime() + startTime = prev + this + } + + def tic(tag: String): Unit = { + if (!reportOrder.contains(tag)) {reportOrder.append(tag)} + val curr = System.nanoTime() + if (ts.contains(tag)) { + ts(tag).append(curr - prev) + } else { ts.update(tag, ArrayBuffer(curr - prev)) } + prev = curr } + + def report(): Unit = { + for (tag <- reportOrder) { + println(s"[${name}] ${tag}: ${ts(tag).sum / NANO_TO_SEC} s") + } + println(s"[${name}] Total: ${(prev - startTime) / NANO_TO_SEC}") + } + + def getElapsedTime(): Double = { (prev - startTime) / NANO_TO_SEC } + } + + def time[R](block: => R): R = { + val t0 = System.nanoTime() + val result = block // call-by-name + val t1 = System.nanoTime() + println("*******************************") + println("Elapsed time: " + (t1 - t0) * 1.0 / 1000000000 + "s") + println("*******************************") + result + } +} diff --git a/tools/kal-test/src/main/scala/com/bigdata/utils/Utils.scala b/tools/kal-test/src/main/scala/com/bigdata/utils/Utils.scala new file mode 100644 index 0000000..e13e3a3 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/bigdata/utils/Utils.scala @@ -0,0 +1,156 @@ +package com.bigdata.utils + +import org.apache.hadoop.fs.{FileSystem, Path} +import org.apache.spark.{SparkConf, SparkContext} +import org.apache.spark.sql.SparkSession +import org.apache.spark.mllib.linalg._ +import org.apache.spark.rdd.RDD + +import java.io.{File, FileInputStream, InputStreamReader, PrintWriter} +import java.nio.charset.StandardCharsets +import java.text.SimpleDateFormat +import java.util.{Date, TimeZone} +import java.nio.file.{Files, Paths} +import scala.io.Source + + +object Utils { + /** + * + * @param filename The resource name + * @return + */ + + def getStream(filename: String): InputStreamReader = { + + val file = new File(filename) + if (!file.exists() || file.isDirectory) { + throw new Exception(s"Fail to find prorerty file[${file}]") + } + val inputStreamReader = new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8) + + inputStreamReader + + } + + /** + * + * @param dateFmt date format + * @param utcMilliseconds "yyyy-MM-dd HH:mm:ss" + * @return String date + */ + def getDateStrFromUTC(dateFmt: String, utcMilliseconds: Long): String = { + val sf = new SimpleDateFormat(dateFmt) + sf.setTimeZone(TimeZone.getTimeZone("Asia/Shanghai")) + sf.format(new Date(utcMilliseconds)) + } + + def checkDirs(dirName: String): Unit ={ + val folder = new File(dirName) + if (!folder.exists()) { + val mkdir = folder.mkdirs() + println(s"Create dir report ${mkdir}") + } + } + + def saveEvaluation(res: Double, savePath: String, sc: SparkContext): Unit ={ + val result = new Array[String](1) + result(0) = res.toString + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(savePath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + sc.parallelize(result).repartition(1).saveAsTextFile(savePath) + } + + def saveLDARes(res: 
BigDecimal, savePath: String, sc: SparkContext): Unit ={ + val result = new Array[String](1) + result(0) = res.toString + val fs = FileSystem.get(sc.hadoopConfiguration) + val saveFile = new Path(savePath) + if (fs.exists(saveFile)) { + fs.delete(saveFile, true) + } + sc.parallelize(result).repartition(1).saveAsTextFile(savePath) + } + + def compareDoubleResults(saveDataPath: String, verifiedDataPath: String): String = { + if(Files.exists(Paths.get(verifiedDataPath))){ + val saveFile = Source.fromFile(saveDataPath) + val verifiedFile = Source.fromFile(verifiedDataPath) + val pri = saveFile.getLines().toArray + val opt = verifiedFile.getLines().toArray + saveFile.close() + verifiedFile.close() + if (math.abs(pri(0).toDouble - opt(0).toDouble) / pri(0).toDouble <= 0.005) { + return "correct" + } + else { + return "incorrect" + } + }else{ + return "invaildComparison" + } + } + + /** + * Convert DenseMatrix to 2-dimension array, stored in row major + * @param matrix Input matrix + * @return 2-dimension array, stored in row major + */ + def toRowMajorArray(matrix: DenseMatrix): Array[Array[Double]] = { + val nRow = matrix.numRows + val nCol = matrix.numCols + val arr = new Array[Array[Double]](nRow).map(_ => new Array[Double](nCol)) + if(matrix.isTransposed){ + var srcOffset = 0 + for{i <- 0 until nRow} { + System.arraycopy(matrix.values, srcOffset, arr(i), 0, nCol) + srcOffset += nCol + } + } else { + matrix.values.indices.foreach(idx => { + val j = math.floor(idx / nRow).toInt + val i = idx % nRow + arr(i)(j) = matrix.values(idx) + }) + } + arr + } + + def writeMatrix(mat: DenseMatrix, path: String): Unit = { + val writer =new PrintWriter(path) + val arr = toRowMajorArray(mat) + arr.foreach(vec => writer.write(vec.mkString(",") + "\n")) + writer.close() + } + + def writeVector(vector: DenseVector, path: String): Unit = { + val writer =new PrintWriter(path) + vector.values.foreach(d => writer.write(d + "\n")) + writer.close() + } + + def readMatrix(path: String): Array[Array[Double]] = { + val file = Source.fromFile(path) + val arr = file.getLines().map(line => line.split(",").map(_.toDouble)).toArray + file.close() + arr + } + + def isEqualMatrix(opensourceMatrix: Array[Array[Double]], boostkitMatrix: Array[Array[Double]], tol: Double = 1e-6): Boolean = { + if(opensourceMatrix.length != boostkitMatrix.length) + return false + for(i <- boostkitMatrix.indices) { + if(opensourceMatrix(i).length != boostkitMatrix(i).length) + return false + for(j <- opensourceMatrix(i).indices) { + if(math.abs(math.abs(opensourceMatrix(i)(j)) - math.abs(boostkitMatrix(i)(j))) > tol) + return false + } + } + true + } + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelBayesianOptimization.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelBayesianOptimization.scala new file mode 100644 index 0000000..4259698 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelBayesianOptimization.scala @@ -0,0 +1,377 @@ +// scalastyle:off +package com.tencent.angel.spark.automl + +import com.bigdata.utils.Utils +import com.bigdata.compare.ml.UpEvaluationVerify + +import com.tencent.angel.spark.automl.tuner.config.Configuration +import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace +import com.tencent.angel.spark.automl.tuner.solver.Solver +import org.apache.spark.SparkConf +import org.apache.spark.ml.classification.{GBTClassifier, LogisticRegression, RandomForestClassifier} +import 
org.apache.spark.ml.evaluation.{BinaryClassificationEvaluator, RegressionEvaluator} +import org.apache.spark.ml.feature.{LabeledPoint, OneHotEncoder, SQLTransformer, StringIndexer, VectorAssembler} +import org.apache.spark.ml.regression.RandomForestRegressor +import org.apache.spark.ml.{Model, Pipeline} +import org.apache.spark.mllib.evaluation.RegressionMetrics +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.functions.{abs, coalesce, col, hash, lit, mean} +import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType} +import org.apache.spark.ml.linalg.Vectors +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.FileWriter +import java.util +import scala.beans.BeanProperty +import scala.util.Random + +class ABOConfig extends Serializable { + @BeanProperty var bo: util.HashMap[String, util.HashMap[String, util.HashMap[String, Object]]] = _ +} + +class ABOParams extends Serializable { + @BeanProperty var partitionNum: Int = _ + + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var evaluation: Double = _ + @BeanProperty var costTime: Double = _ + @BeanProperty var cpuName: String = _ + @BeanProperty var isRaw: String = _ + @BeanProperty var startTime: Long = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ + @BeanProperty var saveDataPath: String = _ + @BeanProperty var verifiedDataPath: String = _ + @BeanProperty var ifCheck: String = _ + @BeanProperty var isCorrect: String = _ +} + +object AngelBayesianOptimization { + def main(args: Array[String]): Unit = { + try { + val modelConfSplit = args(0).split("-") + val (datasetName, isRaw, ifCheck) = + (modelConfSplit(0), modelConfSplit(1), modelConfSplit(2)) + val dataPath = args(1) + val cpuName = args(2) + val saveResultPath = args(3) + + val stream = Utils.getStream("conf/ml/bo/bo.yml") + val representer = new Representer + representer.addClassTag(classOf[ABOParams], Tag.MAP) + val options = new DumperOptions + options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[ABOConfig]), representer, options) + val description = new TypeDescription(classOf[ABOParams]) + yaml.addTypeDescription(description) + val configs: ABOConfig = yaml.load(stream).asInstanceOf[ABOConfig] + val paramsMap: util.HashMap[String, Object] = configs.bo.get("raw").get(datasetName) + val params = new ABOParams() + params.setPartitionNum(paramsMap.getOrDefault("partitionNum", "1000").asInstanceOf[Int]) + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setCpuName(cpuName) + params.setIsRaw(isRaw) + params.setIfCheck(ifCheck) + params.setAlgorithmName("BO") + params.setVerifiedDataPath(s"${saveResultPath}/${params.algorithmName}/${datasetName}") + params.setSaveDataPath(s"${params.verifiedDataPath}_raw") + var appName = s"${params.algorithmName}_${datasetName}_raw" + params.setTestcaseType(appName) + + val conf = new SparkConf().setAppName(appName) + val spark = SparkSession.builder.config(conf).getOrCreate() + println(s"Initialized spark session.") + val startTime = System.currentTimeMillis() + params.setStartTime(startTime) + val sc = spark.sparkContext + + var searchArray:Array[Int] = Array() + var metricArray:Array[Double] = Array() + var timeArray:Array[Double] = Array() + for (a 
<- 1 to 10) { + val res = if (datasetName == "BostonHousing") { + bostonHousingRfRegressor(spark, params, a.toLong) + } else if (datasetName == "TitanicRf") { + titanicRf(spark, params, a.toLong) + } else if (datasetName == "TitanicGBT") { + titanicGBT(spark, params, a.toLong) + } else { + (0, 0.0, 0.0) + } + searchArray +:= res._1 + metricArray +:= res._2 + timeArray +:= res._3 + } + + val res = metricArray.sum * 1.0 / metricArray.length + val costTime = timeArray.sum * 1.0 / metricArray.length + println(searchArray.mkString(" ")) + println(searchArray.sum * 1.0 / metricArray.length) + println(metricArray.mkString(" ")) + println(res) + println(timeArray.mkString(" ")) + println(costTime) + params.setEvaluation(res) + params.setCostTime(costTime) + + Utils.checkDirs("report") + if (ifCheck.equals("yes")) { + params.setIsCorrect(UpEvaluationVerify.compareRes(params.saveDataPath, params.verifiedDataPath, spark)) + val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true) + writerIsCorrect.write(s"${params.testcaseType} ${params.isCorrect} \n") + writerIsCorrect.close() + } + + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + + println(s"Exec Successful: costTime: ${costTime}s; evaluation: ${res};isCorrect: ${params.isCorrect}") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } + + def bostonHousingRfRegressor(spark: SparkSession, params: ABOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + val housingData = spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ") + .option("header", true).option("inferSchema", "true").csv(trainPath).repartition(partitionNum) + val features = housingData.drop("MEDV") + + val Array(trainingData, testData) = housingData.withColumnRenamed("MEDV", "label") + .randomSplit(Array(0.8, 0.2), seed = 42) + trainingData.persist() + testData.persist() + val regressor = new RandomForestRegressor() + + var paramList: Array[ParamSpace[Double]] = Array() + paramList :+= ParamSpace.fromConfigString(regressor.numTrees.toString(), "{3:30:1}", "Int") + paramList :+= ParamSpace.fromConfigString(regressor.maxDepth.toString(), "{2:20:1}", "Int") + paramList :+= ParamSpace.fromConfigString(regressor.subsamplingRate.toString(), "[0.5,1.0]", "Double") + paramList :+= ParamSpace.fromConfigString(regressor.minInfoGain.toString(), "[0,1.0]", "Double") + paramList :+= ParamSpace.fromConfigString(regressor.minInstancesPerNode.toString(), "{1,2,3,5,10,15,20}", "Int") + + val featureColumnsNames = features.columns.toArray + val assembler = new VectorAssembler() + .setInputCols(featureColumnsNames) + .setOutputCol("features") + + val pipeline = new Pipeline().setStages(Array(assembler, regressor)) + import org.apache.spark.ml.evaluation.RegressionEvaluator + + val cv = new AngelOptCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new RegressionEvaluator().setMetricName("rmse")) + .setEstimatorParamSpace(paramList) + // .setSurrogate("RandomForest") + .setSurrogate("GaussianProcess") + .setNumIterations(500) + .setNumFolds(5) + .setParallelism(10) + .setThreshold(3.45) + .setSeed(seed) + + val model = cv.fit(trainingData) + println(cv.searchNumber) + println(cv.bestMetric) + trainingData.unpersist() + testData.unpersist() 
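+    // searchNumber and bestMetric are populated by AngelOptCrossValidator.fit above;
+    // the elapsed seconds computed below cover CSV loading plus the full hyper-parameter search.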
+ val endTime = System.currentTimeMillis() + println((endTime - startTime)/1000.0) + (cv.searchNumber, cv.bestMetric, (endTime - startTime)/1000.0 ) + } + + def titanicRf(spark: SparkSession, params: ABOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + var dataWithNulls = { + spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ").option("header", true).csv(trainPath) + .repartition(1) + .withColumn("survived", col("Survived").cast(DoubleType)) + .withColumn("age", col("Age").cast(DoubleType)) + .withColumn("siblings_spouses", col("SibSp").cast(DoubleType)) + .withColumn("parents_children", col("Parch").cast(DoubleType)) + .withColumn("fare", col("Fare").cast(DoubleType)) + .select(col("survived"), col("Name") as "passenger_name", col("Pclass") as "passenger_class", col("Sex") as "sex", + col("age"), col("fare"), col("siblings_spouses"), col("parents_children")) + .repartition(partitionNum) + } + val meanAge = dataWithNulls.select(mean("age")).first.getDouble(0) + val data = dataWithNulls.withColumn("age", coalesce(col("age"), lit(meanAge))).cache() + val titleTransformer = new SQLTransformer("title").setStatement( + s""" + |SELECT * + |, CASE WHEN passenger_name LIKE '%\\.%' THEN split(passenger_name, '\\\\.')[0] + | ELSE 'Nothing' + | END AS passenger_title + |FROM __THIS__ + """.stripMargin + ) + val categoricalCols = Array("passenger_class", "sex", "passenger_title") + val indexCols = categoricalCols.map(_ + "_index") + val oheCols = categoricalCols.map(_ + "_ohe") + val stringIndexers = categoricalCols.map(cc => { + new StringIndexer(s"string_indexer_$cc") + .setHandleInvalid("keep") + .setInputCol(cc) + .setOutputCol(cc + "_index") + }) + val oneHotEncoder = { + new OneHotEncoder("ohe") + .setHandleInvalid("keep") + .setDropLast(false) + .setInputCols(indexCols) + .setOutputCols(oheCols) + } + + val numericalCols = Array("age", "fare", "siblings_spouses", "parents_children") + val vectorAssembler = { + new VectorAssembler("vector_assembler") + .setInputCols(oheCols ++ numericalCols) + .setOutputCol("features") + } + + val rawClassifier = new RandomForestClassifier("rf") + .setFeaturesCol("features") + .setLabelCol("survived") + .setProbabilityCol("survival_prob") + .setRawPredictionCol("survival_raw_pred") + + val pipeline = new Pipeline("pipeline") + .setStages(Array(titleTransformer) ++ stringIndexers ++ Array(oneHotEncoder, vectorAssembler, rawClassifier)) + + var paramList: Array[ParamSpace[Double]] = Array() + paramList :+= ParamSpace.fromConfigString(rawClassifier.maxDepth.toString(), "{2:20:1}", "Int") + paramList :+= ParamSpace.fromConfigString(rawClassifier.numTrees.toString(), "{3:30:1}", "Int") + paramList :+= ParamSpace.fromConfigString(rawClassifier.minInfoGain.toString(), "[0,0.1]", "Double") + paramList :+= ParamSpace.fromConfigString(rawClassifier.subsamplingRate.toString(), "[0.6,1]", "Double") + + val cv = new AngelOptCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator() + .setLabelCol("survived") + .setRawPredictionCol("survival_raw_pred")) + .setEstimatorParamSpace(paramList) + .setSurrogate("RandomForest") + .setNumIterations(500) + .setThreshold(0.856) + .setNumFolds(5) + .setSeed(seed) + + val model = cv.fit(data) + println(cv.searchNumber) + println(cv.bestMetric) + data.unpersist() + val endTime = System.currentTimeMillis() + 
println((endTime - startTime)/1000.0) + (cv.searchNumber, cv.bestMetric, (endTime - startTime)/1000.0 ) + } + + def titanicGBT(spark: SparkSession, params: ABOParams, seed:Long): (Int, Double, Double) ={ + val sc = spark.sparkContext + sc.setLogLevel("WARN") + val trainPath = params.dataPath + val partitionNum = params.partitionNum + val startTime = System.currentTimeMillis() + + var dataWithNulls = { + spark.read.option("timestampFormat", "yyyy/MM/dd HH:mm:ss ZZ").option("header", true).csv(trainPath) + .repartition(1) + .withColumn("survived", col("Survived").cast(DoubleType)) + .withColumn("age", col("Age").cast(DoubleType)) + .withColumn("siblings_spouses", col("SibSp").cast(DoubleType)) + .withColumn("parents_children", col("Parch").cast(DoubleType)) + .withColumn("fare", col("Fare").cast(DoubleType)) + .select(col("survived"), col("Name") as "passenger_name", col("Pclass") as "passenger_class", col("Sex") as "sex", + col("age"), col("fare"), col("siblings_spouses"), col("parents_children")) + .repartition(partitionNum) + } + val meanAge = dataWithNulls.select(mean("age")).first.getDouble(0) + val data = dataWithNulls.withColumn("age", coalesce(col("age"), lit(meanAge))).cache() + val titleTransformer = new SQLTransformer("title").setStatement( + s""" + |SELECT * + |, CASE WHEN passenger_name LIKE '%\\.%' THEN split(passenger_name, '\\\\.')[0] + | ELSE 'Nothing' + | END AS passenger_title + |FROM __THIS__ + """.stripMargin + ) + val categoricalCols = Array("passenger_class", "sex", "passenger_title") + val indexCols = categoricalCols.map(_ + "_index") + val oheCols = categoricalCols.map(_ + "_ohe") + val stringIndexers = categoricalCols.map(cc => { + new StringIndexer(s"string_indexer_$cc") + .setHandleInvalid("keep") + .setInputCol(cc) + .setOutputCol(cc + "_index") + }) + val oneHotEncoder = { + new OneHotEncoder("ohe") + .setHandleInvalid("keep") + .setDropLast(false) + .setInputCols(indexCols) + .setOutputCols(oheCols) + } + + val numericalCols = Array("age", "fare", "siblings_spouses", "parents_children") + val vectorAssembler = { + new VectorAssembler("vector_assembler") + .setInputCols(oheCols ++ numericalCols) + .setOutputCol("features") + } + + val rawClassifier = new GBTClassifier() + .setFeaturesCol("features") + .setLabelCol("survived") + .setProbabilityCol("survival_prob") + .setRawPredictionCol("survival_raw_pred") + + val pipeline = new Pipeline("pipeline") + .setStages(Array(titleTransformer) ++ stringIndexers ++ Array(oneHotEncoder, vectorAssembler, rawClassifier)) + + var paramList: Array[ParamSpace[Double]] = Array() + paramList :+= ParamSpace.fromConfigString(rawClassifier.maxIter.toString(), "{3:10:1}", "Int") + paramList :+= ParamSpace.fromConfigString(rawClassifier.subsamplingRate.toString(), "[0.5,1.0]", "Double") + paramList :+= ParamSpace.fromConfigString(rawClassifier.minInfoGain.toString(), "[0.0,0.5]", "Double") + paramList :+= ParamSpace.fromConfigString(rawClassifier.maxDepth.toString(), "{3:10:1}", "Int") + + val cv = new AngelOptCrossValidator() + .setEstimator(pipeline) + .setEvaluator(new BinaryClassificationEvaluator() + .setLabelCol("survived") + .setRawPredictionCol("survival_raw_pred")) + .setEstimatorParamSpace(paramList) + .setSurrogate("GaussianProcess") + .setNumIterations(500) + .setThreshold(0.86) + .setNumFolds(4) + .setSeed(seed) + + val model = cv.fit(data) + println(cv.searchNumber) + println(cv.bestMetric) + data.unpersist() + val endTime = System.currentTimeMillis() + println((endTime - startTime)/1000.0) + (cv.searchNumber, 
cv.bestMetric, (endTime - startTime)/1000.0 ) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelOptCrossValidator.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelOptCrossValidator.scala new file mode 100644 index 0000000..2958b07 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/AngelOptCrossValidator.scala @@ -0,0 +1,235 @@ +// scalastyle:off +package com.tencent.angel.spark.automl + +import com.tencent.angel.spark.automl.tuner.TunerParam +import com.tencent.angel.spark.automl.tuner.config.Configuration +import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace +import com.tencent.angel.spark.automl.tuner.solver.Solver +import org.apache.hadoop.fs.Path +import org.apache.spark.SparkContext +import org.apache.spark.annotation.Experimental +import org.apache.spark.ml._ +import org.apache.spark.ml.evaluation.Evaluator +import org.apache.spark.ml.param._ +import org.apache.spark.ml.tuning.CrossValidator +import org.apache.spark.ml.util._ +import org.apache.spark.mllib.util.MLUtils +import org.apache.spark.sql.{DataFrame, Dataset, SQLContext} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.PublicThreadUtils +import org.json4s.{DefaultFormats, JObject, JValue} +import org.json4s.jackson.JsonMethods.{compact, parse, render} + +import scala.collection.JavaConverters._ +import scala.concurrent.duration.Duration +import scala.concurrent.{ExecutionContext, Future} + +private[spark] trait AngelOptCrossValidatorParams extends Params { + protected def transformSchemaImpl(schema: StructType): StructType = { + $(estimator).transformSchema(schema) + } + + val estimator: Param[Estimator[_]] = new Param(this, "estimator", "estimator for selection") + def getEstimator: Estimator[_] = $(estimator) + + val evaluator: Param[Evaluator] = new Param(this, "evaluator", + "evaluator used to select hyper-parameters that maximize the validated metric") + def getEvaluator: Evaluator = $(evaluator) + + val numFolds: IntParam = new IntParam( + this, + "numFolds", + "number of folds for cross validation (>= 2)", + ParamValidators.gtEq(2) + ) + + def getNumFolds: Int = $(numFolds) + + val numIterations: IntParam = new IntParam( + this, + "numIterations", + "number of cross validations to run (>= 2)", + ParamValidators.gtEq(2) + ) + + def getNumIterations: Int = $(numIterations) + + setDefault(numFolds -> 3) + setDefault(numIterations -> 10) + + final val seed: LongParam = new LongParam(this, "seed", "random seed") + + setDefault(seed, this.getClass.getName.hashCode.toLong) + + /** @group getParam */ + final def getSeed: Long = $(seed) + + protected final val surrogateName = new Param[String](this, "surrogateName", "tuning method, such as GaussianProcess, RandomForest, Random, Grid") + def getSurrogate: String = $(surrogateName) + setDefault(surrogateName -> "RandomForest") + + val estimatorParamSpace: Param[Array[ParamSpace[Double]]] = + new Param(this, "estimatorParamSpace", "hyper-parameters space for the estimator") + + def getEstimatorParamSpace: Array[ParamSpace[Double]] = $(estimatorParamSpace) + + val parallelism = new IntParam(this, "parallelism", + "the number of threads to use when running parallel algorithms", ParamValidators.gtEq(1)) + + val threshold = new DoubleParam(this, "threshold", + "threshold to stop working") + + setDefault(parallelism -> 1) + setDefault(threshold -> 0.0) + + val thresholdFlag = new BooleanParam(this, "thresholdFlag", + "flag for threshold judgment") + + /** 
@group getParam */ + def getThresholdFlag: Boolean = $(thresholdFlag) + + setDefault(thresholdFlag -> false) + + /** @group expertGetParam */ + def getParallelism: Int = $(parallelism) + + def getExecutionContext: ExecutionContext = { + getParallelism match { + case 1 => + PublicThreadUtils.utils.sameThread + case n => + ExecutionContext.fromExecutorService(PublicThreadUtils.utils + .newDaemonCachedThreadPool(s"${this.getClass.getSimpleName}-thread-pool", n)) + } + } +} + +class AngelOptCrossValidator (override val uid: String) + extends AngelOptCrossValidatorParams { + def this() = this(Identifiable.randomUID("cv")) + + /** @group setParam */ + def setEstimator(value: Estimator[_]): this.type = set(estimator, value) + + /** @group setParam */ + def setEvaluator(value: Evaluator): this.type = set(evaluator, value) + + /** @group setParam */ + def setNumFolds(value: Int): this.type = set(numFolds, value) + + /** @group setParam */ + def setNumIterations(value: Int): this.type = set(numIterations, value) + + def setSeed(value: Long): this.type = set(seed, value) + + /** @group setParam */ + def setSurrogate(value: String): this.type = set(surrogateName, value) + + /** @group setParam */ + def setEstimatorParamSpace(value: Array[ParamSpace[Double]]): this.type = set(estimatorParamSpace, value) + + def setParallelism(value: Int): this.type = set(parallelism, value) + + def setThreshold(value: Double): this.type = { + set(threshold, value) + set(thresholdFlag, true) + } + + def setThresholdFlag(value: Boolean): this.type = set(thresholdFlag, true) + + private def isStop(metric: Double): Boolean = { + if ($(evaluator).isLargerBetter && metric >= $(threshold)) { + true + } else if (!$(evaluator).isLargerBetter && metric <= $(threshold)) { + true + } else { + false + } + } + + var searchNumber: Int = 0 + var bestMetric: Double = 0.0 + + def fit(dataset: Dataset[_]): AngelOptCrossValidatorModel = { + val sqlContext = dataset.sqlContext + val schema = dataset.schema + val est = $(estimator) + val eval = $(evaluator) + val folds = $(numFolds) + val iterations = $(numIterations) + val paramList = $(estimatorParamSpace) + val executionContext = getExecutionContext + + val solver = Solver(paramList, !eval.isLargerBetter, $(surrogateName)) + + var stop = false + val observations = for { + iter <- (1 to iterations) if !stop + } yield { + val configs: Configuration = solver.suggest()(0) + val epm: ParamMap = configs.getParamMapWithParent + val splits = MLUtils.kFold(dataset.toDF.rdd, folds, $(seed)) + val accMetricsFuture = splits.map { case (training, validation) => Future[Double]{ + val trainingDataset = sqlContext.createDataFrame(training, schema) + val validationDataset = sqlContext.createDataFrame(validation, schema) + val models = est.fit(trainingDataset, epm) + eval.evaluate((models.asInstanceOf[Model[_]]).transform(validationDataset, epm)) + }(executionContext) + } + val accMetrics = accMetricsFuture.map(PublicThreadUtils.utils.awaitResult(_, Duration.Inf)) + val avgMetric: Double = (accMetrics).sum / (accMetrics).length + + solver.feed(configs, avgMetric) + + println(iter, configs.getVector, avgMetric) + + if ($(thresholdFlag)) { + stop = isStop(avgMetric) + } + + (epm, avgMetric) + } + val bestObservation = if (eval.isLargerBetter) observations.maxBy(_._2) else observations.minBy(_._2) + val bestModel = (est.fit(dataset, bestObservation._1)).asInstanceOf[Model[_]] + searchNumber = observations.length + bestMetric = bestObservation._2 + + copyValues(new AngelOptCrossValidatorModel(uid, bestModel, 
searchNumber, bestMetric)) + } + + override def copy(extra: ParamMap): AngelOptCrossValidator = { + val copied = defaultCopy(extra).asInstanceOf[AngelOptCrossValidator] + if (copied.isDefined(estimator)) { + copied.setEstimator(copied.getEstimator.copy(extra)) + } + if (copied.isDefined(evaluator)) { + copied.setEvaluator(copied.getEvaluator.copy(extra)) + } + if (copied.isDefined(numFolds)) { + copied.setNumFolds(copied.getNumFolds) + } + if (copied.isDefined(numIterations)) { + copied.setNumIterations(copied.getNumIterations) + } + copied + } +} + +class AngelOptCrossValidatorModel private[spark] ( + override val uid: String, + val bestModel: Model[_], + val searchNumber: Int, + val bestMetric: Double + ) extends Model[AngelOptCrossValidatorModel] with AngelOptCrossValidatorParams { + override def copy(extra: ParamMap): AngelOptCrossValidatorModel = { + val copied = new AngelOptCrossValidatorModel(uid, bestModel.copy(extra).asInstanceOf[Model[_]], searchNumber, bestMetric) + copyValues(copied, extra).setParent(parent) + } + override def transform(dataset: Dataset[_]): DataFrame = { + transformSchema(dataset.schema, true) + bestModel.transform(dataset) + } + override def transformSchema(schema: StructType): StructType = { + bestModel.transformSchema(schema) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/TunerParam.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/TunerParam.scala new file mode 100644 index 0000000..462620a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/TunerParam.scala @@ -0,0 +1,48 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner + +class TunerParam { +} + +object TunerParam { + + var batchSize: Int = 1 + var sampleSize: Int = 10 * batchSize + var defaultGridSize: Int = 100 + var index: Int = 0 + + var taskName: String = "com.tencent.angel.spark.automl.tuner.trail.TestRunner" + + def setBatchSize(num: Int): Unit = { + batchSize = num + } + + def setSampleSize(num: Int): Unit = { + sampleSize = num + } + + def setDefaultGridSize(num: Int): Unit = { + defaultGridSize = num + } + + def setTaskName(name: String): Unit = { + taskName = name + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/Acquisition.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/Acquisition.scala new file mode 100644 index 0000000..21f1c93 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/Acquisition.scala @@ -0,0 +1,38 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 
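A minimal sketch of the suggest / evaluate / feed cycle that AngelOptCrossValidator.fit runs on top of Spark, driven here by a plain objective function instead of a pipeline. The Solver and ParamSpace calls mirror their usage in fit above; the parameter name, bounds, objective, and object name are invented for illustration only.

import com.tencent.angel.spark.automl.tuner.config.Configuration
import com.tencent.angel.spark.automl.tuner.parameter.ParamSpace
import com.tencent.angel.spark.automl.tuner.solver.Solver

object TunerLoopSketch {
  def main(args: Array[String]): Unit = {
    // One continuous hyper-parameter in [0, 4]; the "[lo,hi]" config string follows
    // the format used for subsamplingRate / minInfoGain in the test code above.
    val space: Array[ParamSpace[Double]] = Array(
      ParamSpace.fromConfigString("sketch__x", "[0.0,4.0]", "Double"))

    // The second argument mirrors !evaluator.isLargerBetter in fit (presumably a
    // minimize flag), so false corresponds to a larger-is-better metric.
    val solver = Solver(space, false, "GaussianProcess")

    (1 to 20).foreach { iter =>
      val config: Configuration = solver.suggest()(0)  // one candidate per iteration, as in fit
      val x = config.getVector(0)
      val metric = -(x - 1.0) * (x - 1.0)              // toy objective, maximum at x = 1
      solver.feed(config, metric)                      // update the surrogate with the observation
      println(s"iter $iter: x = $x, metric = $metric")
    }
  }
}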
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition + +import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate +import org.apache.spark.ml.linalg.Vector + + +/** + * Abstract base class for acquisition function + */ +abstract class Acquisition(val surrogate: Surrogate) { + + /** + * Computes the acquisition value for a given point X + * + * @param X : (1, D), the input points where the acquisition function should be evaluated. + * @return (1, 1) Expected Improvement of X, (1, D) Derivative of Expected Improvement at X + */ + def compute(X: Vector, derivative: Boolean = false): (Double, Vector) + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/EI.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/EI.scala new file mode 100644 index 0000000..3c146ca --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/EI.scala @@ -0,0 +1,65 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition + +import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.commons.math3.distribution.NormalDistribution +import org.apache.spark.ml.linalg.{Vector, Vectors} + +/** + * Expected improvement. 
+ * + * @param surrogate + * @param par : Controls the balance between exploration and exploitation of the acquisition function, default=0.0 + * + */ +class EI( + override val surrogate: Surrogate, + val par: Double) + extends Acquisition(surrogate) { + + val LOG: Log = LogFactory.getLog(classOf[Surrogate]) + + override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = { + val pred = surrogate.predict(X) // (mean, variance) + + // Use the best seen observation as incumbent + val eta: Double = surrogate.curBest._2 + //println(s"best seen result: $eta") + + val m: Double = pred._1 + val s: Double = Math.sqrt(pred._2) + //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]") + + if (s == 0) { + // if std is zero, we have observed x on all instances + // using a RF, std should be never exactly 0.0 + (0.0, Vectors.dense(new Array[Double](X.size))) + } else { + val z = (pred._1 - eta - par) / s + val norm: NormalDistribution = new NormalDistribution + val cdf: Double = norm.cumulativeProbability(z) + val pdf: Double = norm.density(z) + val ei = s * (z * cdf + pdf) + //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf") + (ei, Vectors.dense(new Array[Double](X.size))) + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/UCB.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/UCB.scala new file mode 100644 index 0000000..b4b349a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/UCB.scala @@ -0,0 +1,63 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition + +import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.tuning.CrossValidator + +/** + * Expected improvement. + * + * @param surrogate + * @param beta : Controls the upper confidence bound + * Assume : + * - t: number of iteration + * - d: dimension of optimization space + * - v: hyperparameter v = 1 + * - delta: small constant 0.1 (prob of regret) + * Suggest value:beta = sqrt( v* (2* log( (t**(d/2. + 2))*(pi**2)/(3. 
* delta) ))) + */ +class UCB( + override val surrogate: Surrogate, + val beta: Double = 100) + extends Acquisition(surrogate) { + + val LOG: Log = LogFactory.getLog(classOf[Surrogate]) + + override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = { + CrossValidator + + val pred = surrogate.predict(X) // (mean, variance) + + val m: Double = pred._1 + val s: Double = Math.sqrt(pred._2) + + if (s == 0) { + // if std is zero, we have observed x on all instances + // using a RF, std should be never exactly 0.0 + (0.0, Vectors.dense(new Array[Double](X.size))) + } else { + val ucb = m + beta * s + + (ucb, Vectors.dense(new Array[Double](X.size))) + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/AcqOptimizer.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/AcqOptimizer.scala new file mode 100644 index 0000000..c85e23a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/AcqOptimizer.scala @@ -0,0 +1,43 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition.optimizer + +import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition +import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace} + +/** + * Abstract base class for acquisition maximization. + * + * @param acqFunc : The acquisition function which will be maximized + * @param configSpace : Configuration space of parameters + */ +abstract class AcqOptimizer( + val acqFunc: Acquisition, + val configSpace: ConfigurationSpace) { + + /** + * Maximizes the given acquisition function. + * + * @param numPoints : Number of queried points. + * @return A set of tuple(acquisition value, Configuration). + */ + def maximize(numPoints: Int, sorted: Boolean = true): Array[(Double, Configuration)] + + def maximize: (Double, Configuration) +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/LocalSearch.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/LocalSearch.scala new file mode 100644 index 0000000..d61ede1 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/LocalSearch.scala @@ -0,0 +1,49 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. 
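For reference, the two acquisition functions above reduce to

    z      = (mu(x) - eta - par) / sigma(x)
    EI(x)  = sigma(x) * (z * Phi(z) + phi(z))      (0 when sigma(x) = 0)
    UCB(x) = mu(x) + beta * sigma(x)

where (mu(x), sigma^2(x)) = surrogate.predict(x), eta = surrogate.curBest._2 is the best observation seen so far, Phi and phi are the standard normal CDF and PDF, and par / beta are the exploration parameters of EI and UCB respectively.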
You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition.optimizer + +import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition +import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace} + +/** + * Implementation of local search. + * + * @param acqFunc : The acquisition function which will be maximized + * @param configSpace : Configuration space of parameters + * @param epsilon : In order to perform a local move one of the incumbent's neighbors needs at least an improvement higher than epsilon + * @param numIters : Maximum number of iterations that the local search will perform + */ +class LocalSearch( + override val acqFunc: Acquisition, + override val configSpace: ConfigurationSpace, + epsilon: String, numIters: Int) + extends AcqOptimizer(acqFunc, configSpace) { + + /** + * Starts a local search from the given start point and quits if either the max number of steps is reached or + * no neighbor with an higher improvement was found + * + * @param numPoints : Number of queried points. + * @return A set of tuple(acquisition_value, Configuration). + */ + override def maximize(numPoints: Int, + sorted: Boolean = true): Array[(Double, Configuration)] = ??? + + override def maximize: (Double, Configuration) = ??? +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/RandomSearch.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/RandomSearch.scala new file mode 100644 index 0000000..0161b8e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/acquisition/optimizer/RandomSearch.scala @@ -0,0 +1,69 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.acquisition.optimizer + +import com.tencent.angel.spark.automl.tuner.TunerParam +import com.tencent.angel.spark.automl.tuner.acquisition.Acquisition +import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace} +import org.apache.commons.logging.{Log, LogFactory} + +import scala.util.Random + +/** + * Get candidate solutions via random sampling of configurations. 
+ * + * @param acqFunc : The acquisition function which will be maximized + * @param configSpace : Configuration space of parameters + */ +class RandomSearch( + override val acqFunc: Acquisition, + override val configSpace: ConfigurationSpace + ) extends AcqOptimizer(acqFunc, configSpace) { + + val LOG: Log = LogFactory.getLog(classOf[RandomSearch]) + + val rd = new Random() + + override def maximize(numPoints: Int, sorted: Boolean = true): Array[(Double, Configuration)] = { + //println(s"maximize RandomSearch") + val configs: Array[Configuration] = configSpace.sample(TunerParam.sampleSize) + if (configs.isEmpty) { + Array[(Double, Configuration)]() + } else { + //configs.foreach { config => + // println(s"sample a configuration: ${config.getVector.toArray.mkString(",")}") + //} + val retConfigs = if (sorted) { + configs.map { config => + (acqFunc.compute(config.getVector)._1, config) + }.sortWith(_._1 > _._1).take(numPoints) + } + else { + rd.shuffle(configs.map { config => + (acqFunc.compute(config.getVector)._1, config) + }.toTraversable).take(numPoints).toArray + } + retConfigs + } + } + + override def maximize: (Double, Configuration) = { + maximize(1, true).head + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/Configuration.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/Configuration.scala new file mode 100644 index 0000000..5d0a709 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/Configuration.scala @@ -0,0 +1,72 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.config + +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.param._ + +/** + * A single configuration + * + * @param configSpace : The configuration space for this configuration + * @param vector : A vector for efficient representation of configuration. 
+ */ +class Configuration( + param2Idx: Map[String, Int], + param2Doc: Map[String, String], + vector: Vector) { + + def getVector: Vector = vector + + def getParamMap: ParamMap = { + val paramMap = ParamMap.empty + for (name: String <- param2Idx.keys) { + val param: Param[Double] = new Param(this.toString, name, param2Doc.getOrElse(name, "")) + paramMap.put(param, vector(param2Idx(name))) + } + paramMap + } + + def getParamMapWithParent: ParamMap = { + val paramMap = ParamMap.empty + for (name: String <- param2Idx.keys) { + val nameString = name.split("__") + val doc = param2Doc(name) + doc match { + case "Int" => + val param: Param[Int] = new Param(nameString(0), nameString(1), param2Doc.getOrElse(name, "")) + paramMap.put(param, vector(param2Idx(name)).toInt) + case _ => + val param: Param[Double] = new Param(nameString(0), nameString(1), param2Doc.getOrElse(name, "")) + paramMap.put(param, vector(param2Idx(name))) + } + } + paramMap + } + + def getValues: Array[Double] = vector.toArray + + def keys: List[String] = param2Idx.keys.toList + + def get(name: String): Double = get(param2Idx.getOrElse(name, -1)) + + def get(idx: Int): Double = vector(idx) + + def contains(name: String): Boolean = param2Idx.contains(name) +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/ConfigurationSpace.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/ConfigurationSpace.scala new file mode 100644 index 0000000..0f2e18f --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/ConfigurationSpace.scala @@ -0,0 +1,263 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.config + +import com.tencent.angel.spark.automl.tuner.TunerParam +import com.tencent.angel.spark.automl.tuner.math.BreezeOp._ +import com.tencent.angel.spark.automl.tuner.parameter.{ContinuousSpace, ParamSpace} +import com.tencent.angel.spark.automl.utils.AutoMLException +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.sql.types._ + +import scala.collection.mutable +import scala.collection.mutable.{ArrayBuffer, HashSet} +import scala.reflect.ClassTag + +class ConfigurationSpace( + val name: String, + private var paramDict: Map[String, ParamSpace[AnyVal]] = Map()) { + + val LOG: Log = LogFactory.getLog(classOf[ConfigurationSpace]) + + var numParams: Int = paramDict.size + + var fields: ArrayBuffer[StructField] = new ArrayBuffer[StructField]() + + var param2Idx: Map[String, Int] = paramDict.keys.zipWithIndex.toMap + var param2Doc: Map[String, String] = paramDict.map { case (k: String, v: ParamSpace[AnyVal]) => (k, v.doc) } + var idx2Param: Map[Int, String] = param2Idx.map(_.swap) + + // param name -> param type (continuous or discrete), value type (int, double,...) + val paramType: mutable.Map[String, (String, String)] = new mutable.HashMap[String, (String, String)]() + + // configurations tried + var preX: HashSet[Vector] = HashSet[Vector]() + var gridValues: Array[Configuration] = Array.empty + var gridIndice = 0 + + def getParamNum: Int = numParams + + def addParams(params: List[ParamSpace[AnyVal]]): Unit = { + params.foreach(addParam) + } + + def addParam[T <: AnyVal : ClassTag](param: ParamSpace[T]): Unit = { + if (!paramDict.contains(param.name)) { + fields += DataTypes.createStructField(param.name, DataTypes.DoubleType, false) + paramType += param.name -> (param.pType, param.vType) + paramDict += (param.name -> param) + param2Idx += (param.name -> numParams) + param2Doc += (param.name -> param.doc) + idx2Param += (numParams -> param.name) + numParams += 1 + } + println(s"add param ${param.toString}, current params: ${paramDict.keySet.mkString(",")}") + } + + def addParamType(pName: String, pType: String, vType: String): Unit = { + if (!paramType.contains(pName)) + paramType += pName -> (pType, vType) + } + + def getParamType(pName: String): (String, String) = { + if (paramType.contains(pName)) + paramType(pName) + else + throw new AutoMLException(s"param $pName not exists in the configuration space.") + } + + def getFields: Array[StructField] = fields.toArray + + def getParams(): Array[ParamSpace[AnyVal]] = paramDict.values.toArray + + def getParamByName(name: String): Option[ParamSpace[AnyVal]] = paramDict.get(name) + + def getIdxByParam(name: String): Option[Int] = param2Idx.get(name) + + def getParamByIdx(idx: Int): Option[ParamSpace[AnyVal]] = paramDict.get(idx2Param.getOrElse(idx, "none")) + + def getDocByName(name: String): Option[String] = param2Doc.get(name) + + def addHistories(vecs: Array[Vector]): Unit = preX ++= vecs + + def addHistory(vec: Vector): Unit = preX += vec + + def setAllToGrid(): Unit = { + getParams().foreach { + case cParam: ContinuousSpace => + if (!cParam.isGrid) cParam.resetGrid(TunerParam.defaultGridSize) + case _ => + } + } + + def spaceSize(): Int = { + var size: Int = if (numParams > 0) 1 else 0 + var hasInfinite = false + getParams().foreach { param => + param.numValues match { + case Int.MaxValue => hasInfinite = true + case _ => size *= param.numValues + } + } + if (hasInfinite) Int.MaxValue else size + } + + 
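+  // Note on sampling: spaceSize() multiplies numValues over all parameters, so it is
+  // finite only when every parameter is discrete; a single continuous parameter
+  // (numValues == Int.MaxValue) makes the reported size Int.MaxValue. sample() and
+  // randomSample() draw candidate vectors parameter-by-parameter and filter out any
+  // vector already recorded in preX (isValid), so previously tried configurations are
+  // not proposed again, while gridSample() walks the grid built by getGridConfigs().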
def sample(size: Int): Array[Configuration] = { + var configs: ArrayBuffer[Configuration] = new ArrayBuffer[Configuration] + + var missing: Int = 0 + val left = if (spaceSize() == Int.MaxValue) Int.MaxValue else spaceSize - preX.size + val trueSize = left min size + println(s"configuration space size ${spaceSize()}, remaining $left, sample $trueSize") + do { + missing = trueSize - configs.length + val vectors: Array[Vector] = Array.fill(missing)(Vectors.dense(new Array[Double](numParams))) + param2Idx.foreach { case (paramName, paramIdx) => + paramDict.get(paramName) match { + case Some(param) => + param.sample(missing).map(asDouble).zipWithIndex.foreach { case (f: Double, i: Int) => + vectors(i).toArray(paramIdx) = f + } + case None => LOG.info(s"Cannot find $paramName.") + } + } + val validVectors = vectors.filter(isValid) + validVectors.foreach { vec => + configs += new Configuration(param2Idx, param2Doc, vec) + } + } while (configs.length < trueSize) + + configs.toArray + } + + def randomSample(size: Int): Array[Configuration] = { + var configs: ArrayBuffer[Configuration] = new ArrayBuffer[Configuration] + + var missing: Int = 0 + val left = if (spaceSize() == Int.MaxValue) Int.MaxValue else spaceSize - preX.size + val trueSize = left min size + println(s"configuration space size ${spaceSize()}, remaining $left, sample $trueSize") + do { + missing = trueSize - configs.length + val vectors: Array[Vector] = Array.fill(missing)(Vectors.dense(new Array[Double](numParams))) + param2Idx.foreach { case (paramName, paramIdx) => + paramDict.get(paramName) match { + case Some(param) => + param.sample(missing).map(asDouble).zipWithIndex.foreach { case (f: Double, i: Int) => + vectors(i).toArray(paramIdx) = f + } + case None => LOG.info(s"Cannot find $paramName.") + } + } + val validVectors = vectors.filter(isValid) + validVectors.foreach { vec => + configs += new Configuration(param2Idx, param2Doc, vec) + } + } while (configs.length < trueSize) + configs.toArray + } + + def gridSample(size: Int): Array[Configuration] = { + if (gridValues.isEmpty) { + gridValues = getGridConfigs() + } + val startIndice = gridIndice + val endIndice = (gridIndice + size) min gridValues.size + println(s"configuration space size ${gridValues.size}, " + + s"remaining ${gridValues.size - startIndice}, sample from $startIndice to $endIndice") + gridIndice = endIndice + if (startIndice == gridValues.size) { + Array.empty + } else { + val ret = new Array[Configuration](endIndice - startIndice) + Array.copy(gridValues, startIndice, ret, 0, endIndice - startIndice) + ret + } + } + + def getGridConfigs(): Array[Configuration] = { + //assert(spaceSize() < Int.MaxValue, "all parameters must be discrete!") + //println(s"configuration space size ${spaceSize()}") + var configs: ArrayBuffer[Configuration] = new ArrayBuffer[Configuration] + + var tmp: ArrayBuffer[Array[Double]] = new ArrayBuffer[Array[Double]] + + val params = getParams() + + params.foreach { + tmp += _.getValues + } + + val paramsArray: Array[Array[Double]] = tmp.toArray + + if (numParams == 1) { + var tmp: ArrayBuffer[Vector] = new ArrayBuffer[Vector] + paramsArray.head.foreach { + tmp += Vectors.dense(_) + } + val paramsVec = tmp.toArray + paramsVec.filter(isValid).foreach { vec => + configs += new Configuration(param2Idx, param2Doc, vec) + } + configs.toArray + } else if (numParams == 2) { + val paramsGrid: Array[Array[Double]] = cartesian(paramsArray(0), paramsArray(1)) + var tmp: ArrayBuffer[Vector] = new ArrayBuffer[Vector] + paramsGrid.foreach { + tmp += 
Vectors.dense(_) + } + val paramsVec: Array[Vector] = tmp.toArray + paramsVec.filter(isValid).foreach { vec => + configs += new Configuration(param2Idx, param2Doc, vec) + } + configs.toArray + } else { + var paramsGrid: Array[Array[Double]] = cartesian(paramsArray(0), paramsArray(1)) + + paramsArray.foreach { a => + if (!(a sameElements paramsArray(0)) && !(a sameElements paramsArray(1))) { + paramsGrid = cartesian(paramsGrid, a) + } + } + + var tmp: ArrayBuffer[Vector] = new ArrayBuffer[Vector] + paramsGrid.foreach { + tmp += Vectors.dense(_) + } + val paramsVec: Array[Vector] = tmp.toArray + paramsVec.filter(isValid).foreach { vec => + configs += new Configuration(param2Idx, param2Doc, vec) + } + configs.toArray + } + } + + def asDouble(num: AnyVal): Double = { + num match { + case i: Int => i.toDouble + case i: Long => i.toLong + case i: Float => i.toDouble + case i: Double => i + case _ => throw new AutoMLException(s"type ${num.getClass} is not supported") + } + } + + def isValid(vec: Vector): Boolean = !preX.contains(vec) +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/EarlyStopping.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/EarlyStopping.scala new file mode 100644 index 0000000..7f3eb03 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/config/EarlyStopping.scala @@ -0,0 +1,59 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.config + +/** + * A single configuration + * + * @param patience : How long to wait after last time validation loss improved. 
+ * Default: 5 + * @param minimize : Whether to minimize or maximize the val_score + * Default: false + */ +class EarlyStopping(patience: Int = 5, + minDelta: Double = 0.0, + minimize: Boolean = false) { + + var counter: Int = 0 + var bestScore: Double = if (minimize) Double.PositiveInfinity else Double.NegativeInfinity + var earlyStop: Boolean = false + val pat = patience + + def greater(a: Double, b: Double): Boolean = a > b + + def less(a: Double, b: Double): Boolean = a < b + + val monitorOp: (Double, Double) => Boolean = if (minimize) less else greater + + def bound(score: Double): Double = if (minimize) score + minDelta else score - minDelta + + def update(val_score: Double): Unit = { + val score = val_score + if (monitorOp(bound(score), bestScore)) { + bestScore = score + counter = 0 + } else { + counter += 1 + println(s"EarlyStopping counter: $counter out of $patience") + if (counter >= patience) { + earlyStop = true + } + } + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Covariance.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Covariance.scala new file mode 100644 index 0000000..4f2de3c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Covariance.scala @@ -0,0 +1,52 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.kernel + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} + +/** + * Covariance function given two points. + */ +trait Covariance { + + /** + * the covariance function + * + * @param x1 + * @param x2 + * @param params + * @return + */ + def cov(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): BDM[Double] + + /** + * the derivative of covariance function against kernel hyper-parameters + * + * @param x1 + * @param x2 + * @param params + * @return + */ + def grad(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): Array[BDM[Double]] + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/CovarianceType.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/CovarianceType.scala new file mode 100644 index 0000000..46d36ad --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/CovarianceType.scala @@ -0,0 +1,42 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. 
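A minimal usage sketch of the EarlyStopping helper above, with invented scores: in the default maximize mode, counter increments whenever score - minDelta fails to beat bestScore, and earlyStop flips once counter reaches patience.

import com.tencent.angel.spark.automl.tuner.config.EarlyStopping

object EarlyStoppingSketch {
  def main(args: Array[String]): Unit = {
    val es = new EarlyStopping(patience = 3)            // maximize mode (minimize = false)
    val scores = Seq(0.70, 0.74, 0.73, 0.74, 0.735, 0.739)  // illustrative values
    scores.foreach { s =>
      if (!es.earlyStop) es.update(s)                   // stop feeding scores once triggered
    }
    // bestScore stays at 0.74; the three following non-improving scores trigger the stop
    println(s"bestScore = ${es.bestScore}, earlyStop = ${es.earlyStop}")
  }
}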
You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.kernel + +object CovarianceType extends Enumeration { + + type CovarianceType = Value + + val MATERN3 = Value("MATERN3") + val MATERN5 = Value("MATERN5") + val MATERN5_ISO = Value("MATERN5_ISO") + val SQUAREEXP_ISO = Value("SQUAREEXP_ISO") + + def fromString(name: String): Covariance = { + val covType = CovarianceType.withName(name.toUpperCase()) + fromString(covType) + } + + def fromString(covType: CovarianceType.Value): Covariance = covType match { + case MATERN3 => new Matern3 + case MATERN5 => new Matern5 + case MATERN5_ISO => new Matern5Iso + case SQUAREEXP_ISO => new SquareExpIso + case _ => new Matern5 + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern3.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern3.scala new file mode 100644 index 0000000..7bd2c99 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern3.scala @@ -0,0 +1,91 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.kernel + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV, _} +import breeze.numerics.{exp, pow, sqrt} +import com.tencent.angel.spark.automl.tuner.math.SquareDist + +/** + * Matern covariance function with v = 3/2 + * (1 + sqrt(3)*r/l) * exp(-sqrt(3)*r/l) + * Here r is the distance |x1-x2| of two points + * Hyper-parameter: l is the length scale + */ +case class Matern3() extends Covariance { + + /** + * the covariance function + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def cov(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): BDM[Double] = { + + require(params.size == 1, + s"Number of hyper parameters is ${params.length} while expected 1") + + val l = params(0) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = sqrt(3) * r / l + 1.0 + val expPart = exp(-sqrt(3) * r / l) + val covMatrix = vPart *:* expPart + + covMatrix + } + + /** + * the derivative of covariance function against kernel hyper-parameters + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def grad(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): Array[BDM[Double]] = { + + require(params.size == 1, + s"Number of hyper parameters is ${params.length} while expected 1") + + val l = params(0) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = sqrt(3) * r / l + 1.0 + val expPart = exp(-sqrt(3) * r / l) + + val vPartGrad = -(sqrt(3) * r / pow(l, 2)) *:* expPart + val expPartGrad = vPart *:* expPart *:* (sqrt(3) * r / pow(l, 2)) + + val gradL = vPartGrad + expPartGrad + + Array(gradL) + } +} + diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5.scala new file mode 100644 index 0000000..ed95c67 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5.scala @@ -0,0 +1,90 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.kernel + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import breeze.numerics._ +import com.tencent.angel.spark.automl.tuner.math.SquareDist + +/** + * Matern covariance function with v = 5/2 + * (1 + sqrt(5)*r/l + 5r^2/(3l^2)) * exp(-sqrt(5)*r/l) + * Here r is the distance |x1-x2| of two points + * Hyper-parameter: l is the length scale + */ +case class Matern5() extends Covariance { + + /** + * the covariance function + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def cov(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): BDM[Double] = { + + require(params.size == 1, + s"Number of hyper parameters is ${params.length} while expected 1") + + val l = params(0) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 + val expPart = exp(-sqrt(5) * r / l) + val covMatrix = vPart *:* expPart + + covMatrix + } + + /** + * the derivative of covariance function against kernel hyper-parameters + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def grad(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): Array[BDM[Double]] = { + + require(params.size == 1, + s"Number of hyper parameters is ${params.length} while expected 1") + + val l = params(0) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 + val expPart = exp(-sqrt(5) * r / l) + + val vPartGrad = -(sqrt(5) * r / pow(l, 2) + 10.0 * distMat / (3.0 * pow(l, 3))) *:* expPart + val expPartGrad = vPart *:* expPart *:* (sqrt(5) * r / pow(l, 2)) + + val gradL = vPartGrad + expPartGrad + + Array(gradL) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5Iso.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5Iso.scala new file mode 100644 index 0000000..a18385c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/Matern5Iso.scala @@ -0,0 +1,93 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
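A small sketch of calling the Matern5 kernel above on concrete points; the inputs are invented, and the DenseMatrix((...)).t construction follows GPExample further below.

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import com.tencent.angel.spark.automl.tuner.kernel.Matern5

object KernelSketch {
  def main(args: Array[String]): Unit = {
    val x1 = BDM((1.0, 2.0, 3.0)).t            // 3 x 1: three 1-D training points
    val x2 = BDM((1.0, 2.5)).t                 // 2 x 1: two 1-D query points
    val k  = Matern5().cov(x1, x2, BDV(1.0))   // single hyper-parameter: length scale l = 1.0
    println(k)  // 3 x 2 matrix; entries approach 1.0 as the pairwise distance approaches 0
  }
}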
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.kernel + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import breeze.numerics._ +import com.tencent.angel.spark.automl.tuner.math.SquareDist + +/** + * Matern covariance function with v = 5/2 and isotropic distance measure + * theta^2 * (1 + sqrt(5)*r/l + 5r^2/(3l^2)) * exp(-sqrt(5)*r/l) + * Here r is the distance |x1-x2| of two points + * Hyper-parameter: theta is the signal variance, l is the length scale + **/ +case class Matern5Iso() extends Covariance { + + /** + * the covariance function + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def cov(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): BDM[Double] = { + + require(params.size == 2, + s"Number of hyper parameters is ${params.length} while expected 2") + + val theta = params(0) + val l = params(1) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = (sqrt(5) * r) / l + distMat / pow(l, 2) * 5.0 / 3.0 + 1.0 + val expPart = exp(-sqrt(5) * r / l) + val covMatrix = pow(theta, 2) * vPart *:* expPart + // println(covMatrix) + covMatrix + } + + /** + * the derivative of covariance function against kernel hyper-parameters + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def grad(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): Array[BDM[Double]] = { + + require(params.size == 2, + s"Number of hyper parameters is ${params.length} while expected 2") + + val theta = params(0) + val l = params(1) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val vPart = sqrt(5) * r / l + 5.0 / 3.0 * distMat / pow(l, 2) + 1.0 + val expPart = exp(-sqrt(5) * r / l) + + val vPartGrad = -(sqrt(5) * r / pow(l, 2) + 10.0 * distMat / (3.0 * pow(l, 3))) *:* expPart * pow(theta, 2) + val expPartGrad = vPart *:* expPart *:* (sqrt(5) * r / pow(l, 2)) * pow(theta, 2) + + val gradL = vPartGrad + expPartGrad + val gradTheta = vPart *:* expPart * 2.0 * theta + // println(cov_l_grad) + Array(gradTheta, gradL) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/SquareExpIso.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/SquareExpIso.scala new file mode 100644 index 0000000..42d0cc1 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/kernel/SquareExpIso.scala @@ -0,0 +1,85 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ +package com.tencent.angel.spark.automl.tuner.kernel + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import breeze.numerics._ +import com.tencent.angel.spark.automl.tuner.math.SquareDist + +/** + * Square exponential covariance function with isotropic distance measure + * k(x1, x2) = theta^2 * exp( -(x1-x2)^2 / l^2 ) + * Hyper-parameter: theta is the signal variance, l is the length scale + **/ +case class SquareExpIso() extends Covariance { + + /** + * the covariance function + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def cov(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): BDM[Double] = { + + require(params.size == 2, + s"Number of hyper parameters is ${params.length} while expected 2") + + val theta = params(0) + val l = params(1) + + val distMat = SquareDist(x1, x2) + + val covMatrix = pow(theta, 2) * exp(-0.5 * distMat / pow(l, 2)) + + covMatrix + } + + /** + * the derivative of covariance function against kernel hyper-parameters + * + * @param x1 + * @param x2 + * @param params + * @return + */ + override def grad(x1: BDM[Double], + x2: BDM[Double], + params: BDV[Double]): Array[BDM[Double]] = { + + require(params.size == 2, + s"Number of hyper parameters is ${params.length} while expected 2") + + val theta = params(0) + val l = params(1) + + val distMat = SquareDist(x1, x2) + val r = sqrt(distMat) + + val expDistMat = exp(-0.5 * distMat / pow(l, 2)) + + val gradTheta = 2 * theta * expDistMat + + val gradL = pow(theta, 2) * expDistMat *:* distMat / pow(l, 3) + + Array(gradTheta, gradL) + } +} + diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/BreezeOp.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/BreezeOp.scala new file mode 100644 index 0000000..e861d20 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/BreezeOp.scala @@ -0,0 +1,86 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
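Writing r = ||x1 - x2|| and d^2 = r^2, the four covariance functions as implemented in cov() are

    Matern3:      (1 + sqrt(3) r / l) * exp(-sqrt(3) r / l)
    Matern5:      (1 + sqrt(5) r / l + 5 d^2 / (3 l^2)) * exp(-sqrt(5) r / l)
    Matern5Iso:   theta^2 * (1 + sqrt(5) r / l + 5 d^2 / (3 l^2)) * exp(-sqrt(5) r / l)
    SquareExpIso: theta^2 * exp(-d^2 / (2 l^2))

with l the length scale and theta the signal parameter of the *Iso variants.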
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.math + +import breeze.linalg.{cholesky, diag, inv, sum, trace, DenseMatrix => BDM, DenseVector => BDV} +import breeze.numerics.log + +import scala.math.Pi + +object BreezeOp { + + /** + * calculate the inverse of a matrix with cholesky decomposition + * + * @param L : the Cholesky decomposition of matrix A where A = L'*L + * @return inv(A)=inv(L)*inv(L') + */ + def choleskyInv(L: BDM[Double]): BDM[Double] = { + val invL = inv(L) + invL * invL.t + } + + /** + * sum of log diag of positive definite matrices + * + * @param L + * @return + */ + def sumLogDiag(L: BDM[Double]): Double = { + 2 * sum(log(diag(L))) + } + + def logLike(meanX: BDV[Double], + KXX: BDM[Double], + invKXX: BDM[Double], + y: BDV[Double]): Double = { + + val m = meanX + + val logDiag = sumLogDiag(cholesky(KXX)) + + val value = -0.5 * (y - m).t * invKXX * (y - m) - 0.5 * logDiag - 0.5 * meanX.size * scala.math.log(2 * Pi) + + value(0) + } + + def logLikeD(meanX: BDV[Double], + invKXX: BDM[Double], + y: BDV[Double], + covGrads: Array[BDM[Double]]): BDV[Double] = { + + val m = meanX + val alpha = invKXX * (y - m) + + val grads = covGrads.map { covGrad => + val tmp = alpha * alpha.t - invKXX + 0.5 * trace(tmp * covGrad) + } + + BDV(grads) + } + + def cartesian(A: Array[Double], B: Array[Double]) = for (a <- A; b <- B) yield { + Array(a, b) + } + + def cartesian(A: Array[Array[Double]], B: Array[Double]) = for (a <- A; b <- B) yield { + (a.toBuffer += b).toArray + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/SquareDist.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/SquareDist.scala new file mode 100644 index 0000000..facd037 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/math/SquareDist.scala @@ -0,0 +1,47 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.math + +import breeze.generic.UFunc +import breeze.linalg.{DenseMatrix => BDM, _} + +/** + * Computes pair-wise square distances between matrices x1 and x2. 
+ * + * @param x1 [N x D] + * @param x2 [M x D] + * @return matrix of square distances [N x M] + */ +object SquareDist extends UFunc { + + implicit object implBinary + extends Impl2[BDM[Double], BDM[Double], BDM[Double]] { + + def apply(x1: BDM[Double], + x2: BDM[Double]): BDM[Double] = { + + val t1 = -2.0 * (x1 * x2.t) + + val t2 = t1(*, ::) + sum(x2.t *:* x2.t, Axis._0).t + + t2(::, *) + sum(x1.t *:* x1.t, Axis._0).t + } + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPExample.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPExample.scala new file mode 100644 index 0000000..a06c894 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPExample.scala @@ -0,0 +1,67 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.model + +import breeze.linalg.{DenseMatrix, DenseVector} +import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso + +object GPExample { + + def main(args: Array[String]): Unit = { + + val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t + val y = 2.0 * DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) + val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t + val truePredZ = 2.0 * DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0) + + // //2.Test no_linear(y=cos(x)+1) + // val X = DenseMatrix((1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0)).t + // val y = cos(DenseVector(1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0))+1.0 + // val z = DenseMatrix((2.5, 4.5,6.5,8.5,10.0,12.0)).t + // val truePredZ = cos(DenseVector(2.5, 4.5,6.5,8.5,10.0,12.0))+1.0 + + // //3.Test no_linear(y=x^2) + // val X = DenseMatrix((1.0,2.0, 3.0,4.0,5.0,6.0,7.0,8.0,9.0)).t + // val y = DenseVector(1.0,4.0, 9.0,16.0,25.0,36.0,49.0,64.0,81.0) + // val z = DenseMatrix((2.5, 4.5,6.5,8.5,10.0,12.0)).t + // val truePredZ = pow(z,2) + + //val covFunc = SquareExpIso() + val covFunc = Matern5Iso() + val initCovParams = DenseVector(1.0, 1.0) + val initNoiseStdDev = 0.01 + + val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) + + gpModel.fit(X, y) + + println("Fitted covariance function params:") + println(gpModel.covParams) + println("Fitted noiseStdDev:") + println(gpModel.noiseStdDev) + println("\n") + + val prediction = gpModel.predict(z) + println("Mean and Var:") + println(prediction) + println("True value:") + println(truePredZ) + } + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPKernelDiffFunc.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPKernelDiffFunc.scala new file mode 100644 index 0000000..33e73d5 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPKernelDiffFunc.scala @@ -0,0 +1,84 @@ 
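The GPKernelDiffFunc that follows plugs these pieces into Breeze's LBFGS: its objective is the negative log marginal likelihood computed by BreezeOp.logLike,

    log p(y | X) = -1/2 (y - m)^T K^{-1} (y - m) - 1/2 log|K| - (n/2) log(2 pi)

with log|K| evaluated as 2 * sum(log(diag(cholesky(K)))) (sumLogDiag), and its gradient with respect to each hyper-parameter follows BreezeOp.logLikeD,

    d logL / d theta_i = 1/2 * tr((alpha alpha^T - K^{-1}) * dK/d theta_i),   alpha = K^{-1} (y - m),

where the noise standard deviation contributes dK/d sigma_n = 2 sigma_n I (covarNoiseGrad).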
+/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.model + +import breeze.linalg.{MatrixNotSymmetricException, NotConvergedException, DenseMatrix => BDM, DenseVector => BDV} +import breeze.optimize.DiffFunction +import com.tencent.angel.spark.automl.tuner.math.BreezeOp + +class GPKernelDiffFunc(model: GPModel) extends DiffFunction[BDV[Double]] { + + var iter: Int = _ + + override def calculate(params: BDV[Double]): (Double, BDV[Double]) = { + + try { + //println(s"------iteration $iter------") + val covParams = BDV(params.toArray.dropRight(1)) + model.covParams = covParams + val noiseStdDev = params.toArray.last + model.noiseStdDev = noiseStdDev + //println(s"covariance params: $covParams") + //println(s"standard derivative: $noiseStdDev") + + val meanX = model.meanFunc(model.X) + val KXX = model.calKXX() + + //println(s"meanX: $meanX") + //println(s"KXX: $KXX") + + val invKXX = model.calInvKXX(KXX) + //println("inverse of KXX:") + //println(invKXX) + + //println("true inverse of KXX:") + //println(inv(KXX)) + + val loglikeLoss = -BreezeOp.logLike(meanX, KXX, invKXX, model.y) + //println(s"log likelihood loss: $loglikeLoss") + + // calculate partial derivatives + val covarFuncGrads = model.covFunc.grad(model.X, model.X, covParams) + //println("covariance grads:") + //covarFuncGrads.foreach(println) + + val covarNoiseGrad = 2 * noiseStdDev * BDM.eye[Double](model.X.rows) + //println("covariance noise grads:") + //println(covarNoiseGrad) + + val allGrads = covarFuncGrads :+ covarNoiseGrad + + val loglikeGrads = BreezeOp.logLikeD(meanX, invKXX, model.y, allGrads).map(d => -d) + //println(s"grad of covariance params: $loglikeGrads") + + iter = iter + 1 + + (loglikeLoss, loglikeGrads) + } catch { + case e: NotConvergedException => + //println(s"not converge exception $e") + //(Double.NaN, BDV.zeros[Double](params.size) * Double.NaN) + throw e + case e: MatrixNotSymmetricException => + println(s"matrix not symmetric exception $e") + (Double.NaN, BDV.zeros[Double](params.size) * Double.NaN) + throw e + } + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPModel.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPModel.scala new file mode 100644 index 0000000..5623502 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/model/GPModel.scala @@ -0,0 +1,177 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. 
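GPKernelDiffFunc above packs the kernel hyper-parameters and the noise standard deviation into a single Breeze vector and returns the negative log marginal likelihood together with its gradient, which is the DiffFunction contract that LBFGS consumes (GPModel.fit below builds the optimizer with maxIter = 10, m = 7). A self-contained toy sketch of that same contract, assuming an illustrative quadratic objective rather than the GP likelihood:

import breeze.linalg.{norm, DenseVector => BDV}
import breeze.optimize.{DiffFunction, LBFGS}

// Toy objective f(p) = ||p - c||^2 with gradient 2 (p - c); it only mirrors the
// (value, gradient) contract used by GPKernelDiffFunc, not the GP likelihood itself.
object DiffFuncSketch {
  def main(args: Array[String]): Unit = {
    val c = BDV(1.0, 2.0, 3.0)
    val f = new DiffFunction[BDV[Double]] {
      override def calculate(p: BDV[Double]): (Double, BDV[Double]) = {
        val d = p - c
        (d dot d, d * 2.0)
      }
    }
    val optimizer = new LBFGS[BDV[Double]](maxIter = 10, m = 7, tolerance = 1e-10)
    val pStar = optimizer.minimize(f, BDV.zeros[Double](3))
    println(s"minimiser: $pStar, distance from target: ${norm(pStar - c)}")
  }
}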
You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.model + +import breeze.linalg.{Axis, MatrixNotSymmetricException, cholesky, diag, DenseMatrix => BDM, DenseVector => BDV} +import breeze.optimize.LBFGS +import com.tencent.angel.spark.automl.tuner.kernel.{Covariance, CovarianceType} +import com.tencent.angel.spark.automl.tuner.math.BreezeOp + +import scala.math._ + +class GPModel(val covFunc: Covariance, + var covParams: BDV[Double], + var noiseStdDev: Double, + val meanFunc: (BDM[Double]) => BDV[Double]) { + + var X: BDM[Double] = _ + var y: BDV[Double] = _ + var KXX: BDM[Double] = _ + var L: BDM[Double] = _ + + def remove(idx: Int): Unit = { + } + + def fit(newX: BDM[Double], + newy: BDV[Double]): Boolean = { + require(newX.rows == newy.length, "incompatible size of the input X and y") + + var trainSuccess = true + + if ((X == null && y == null) || + (newX.rows > X.rows && newy.length > y.length)) { + X = newX + y = newy + } + + val kernelDiffFunc = new GPKernelDiffFunc(this) + val initParams = BDV(covParams.toArray :+ noiseStdDev) + //println(s"init params: ${initParams}") + + var newParams = initParams + val optimizer = new LBFGS[BDV[Double]](maxIter = 10, m = 7, tolerance = 1e-10) + //val optimizer = new SimpleSGD[BDV[Double]](1, 10) + try { + newParams = optimizer.minimize(kernelDiffFunc, initParams) + } catch { + case _: breeze.linalg.NotConvergedException | _: MatrixNotSymmetricException => + //println(s"Breeze Not Converged Exception") + newParams = initParams + trainSuccess = false + X = X.delete(X.rows - 1, Axis._0) + y = y.slice(0, y.length - 1) + } + + // println(optimizer) + // println(s"new params: ${newParams}") + // if(!checkParam(newParams)) { + // newParams = initParams + // println(s"reset to init params: ${newParams}") + // trainSuccess = false + // println(s"history size: ${X.rows} ${y.length}") + // X = X.delete(X.rows - 1, Axis._0) + // y = y.slice(0, y.length - 1) + // println(s"history size: ${X.rows} ${y.length}") + // } + + val newCovParams = BDV(newParams.toArray.dropRight(1)) + val newNoiseStdDev = newParams.toArray.last + + this.covParams = newCovParams + this.noiseStdDev = newNoiseStdDev + + trainSuccess + } + + def checkParam(params: BDV[Double]): Boolean = { + var isValid = true + params.values.foreach { param: Double => + if (param.isNaN || param.isInfinity) + isValid = false + } + isValid + } + + def update(newX: BDM[Double], + newy: BDV[Double]): this.type = { + this + } + + def predict(newX: BDM[Double]): BDM[Double] = { + if (X == null || y == null) { + BDM.zeros(newX.rows, cols = 2) + } else { + val meanX = meanFunc(X) + + val KXX = calKXX() + + val invKXX = calInvKXX(KXX) + + val KXZ = covFunc.cov(X, newX, covParams) + + val KZZ = covFunc.cov(newX, newX, covParams) + + val meanNewX = meanFunc(newX) + + val predMean = meanNewX + KXZ.t * (invKXX * (y - meanX)) + val predVar = diag(KZZ - KXZ.t * invKXX * KXZ).map { v => + if (v < -1e-12 | v.isNaN | v.isInfinite) 0 else v + } + + BDV.horzcat(predMean, predVar) + } + } + + def calKXX(): BDM[Double] = { + val KXX = covFunc.cov(X, X, covParams) + + pow(noiseStdDev, 2) * 
BDM.eye[Double](X.rows) + //+ BDM.eye[Double](X.rows) * 1e-7 + + KXX + } + + def calInvKXX(KXX: BDM[Double]): BDM[Double] = { + val l = cholesky(KXX) + val invKXX = BreezeOp.choleskyInv(l.t) + + invKXX + } +} + +object GPModel { + + def apply(covFunc: Covariance, + covParams: BDV[Double], + noiseStdDev: Double, + meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { + new GPModel(covFunc, covParams, noiseStdDev, meanFunc) + } + + def apply(covFunc: Covariance, + covParams: BDV[Double], + noiseStdDev: Double, + mean: Double = 0.0): GPModel = { + val meanFunc = (x: BDM[Double]) => BDV.zeros[Double](x.rows) + mean + new GPModel(covFunc, covParams, noiseStdDev, meanFunc) + } + + def apply(covName: String, + covParams: BDV[Double], + noiseStdDev: Double, + meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { + new GPModel(CovarianceType.fromString(covName), covParams, noiseStdDev, meanFunc) + } + + def apply(covType: CovarianceType.Value, + covParams: BDV[Double], + noiseStdDev: Double, + meanFunc: (BDM[Double]) => BDV[Double]): GPModel = { + new GPModel(CovarianceType.fromString(covType), covParams, noiseStdDev, meanFunc) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ContinuousSpace.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ContinuousSpace.scala new file mode 100644 index 0000000..1a494a9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ContinuousSpace.scala @@ -0,0 +1,152 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.parameter + +import com.tencent.angel.spark.automl.utils.{AutoMLException, Distribution} + +import scala.collection.mutable.ArrayBuffer +import scala.util.Random + +/** + * + * @param name : Name of the parameter + * @param lower : Start of the continuous space included. + * @param upper : End of the continuous space included. + * @param num : Sampling count if possible. 
+ */ +class ContinuousSpace( + override val name: String, + var lower: Double, + var upper: Double, + var num: Int, + distribution: Distribution.Value = Distribution.LINEAR, + override val doc: String = "continuous param space" + ) extends ParamSpace[Double](name, doc) { + + private val helper: String = "supported format of continuous parameter: [0,1] or [0:1:100]" + + override val pType: String = "continuous" + override val vType: String = "double" + + def this(name: String, lower: Double, upper: Double) = { + this(name, lower, upper, -1) + } + + def this(name: String, config: String) = { + this(name, 0, 1, -1) + val items = parseConfig(config) + lower = items._1 + upper = items._2 + num = items._3 + resetGrid(num) + } + + def this(name: String, config: String, doc: String) = { + this(name, 0, 1, -1, doc=doc) + val items = parseConfig(config) + lower = items._1 + upper = items._2 + num = items._3 + resetGrid(num) + } + + require(lower < upper, s"lower bound should less than upper bound") + + val rd = new Random() + + var isGrid: Boolean = false + var gridValues: Array[Double] = _ + + def parseConfig(input: String): (Double, Double, Int) = { + assert(input.startsWith("[") && input.endsWith("]")) + val config = input.substring(1, input.length - 1) + val ret: (Double, Double, Int) = config.trim match { + case _ if config.contains(",") => + val splits = config.split(',') + splits.length match { + case 2 => (splits(0).toDouble, splits(1).toDouble, -1) + case _ => throw new AutoMLException(s"invalid discrete, $helper") + } + case _ if config.contains(":") => + val splits = config.split(':') + splits.length match { + case 3 => (splits(0).toDouble, splits(1).toDouble, splits(2).toInt) + case _ => throw new AutoMLException(s"invalid discrete, $helper") + } + case _ => throw new AutoMLException(s"invalid discrete, $helper") + } + ret + } + + def getGridValues(num: Int): Array[Double] = { + var ret: ArrayBuffer[Double] = ArrayBuffer[Double]() + distribution match { + case Distribution.LINEAR => + val interval: Double = (upper - lower) / (num - 1) + (0 until num).foreach { i => + ret += lower + i * interval + } + case _ => println(s"Distribution $distribution not supported") + } + ret.toArray + } + + def resetGrid(numGrid: Int): Unit = { + num = numGrid + isGrid = if (numGrid < 0) false else true + gridValues = if (isGrid) getGridValues(numGrid) else Array.empty + } + + def getLower: Double = lower + + def getUpper: Double = upper + + def getValues: Array[Double] = gridValues + + def numValues: Int = if (isGrid) gridValues.length else Int.MaxValue + + def toGridSearch: ParamSpace[Double] = this + + def toRandomSpace: ParamSpace[Double] = this + + override def sample(size: Int): List[Double] = List.fill[Double](size)(sampleOne) + + def sampleOne(): Double = { + if (isGrid) + gridValues(rd.nextInt(numValues)) + else + lower + (upper - lower) * rd.nextDouble() + } + + override def toString: String = + if (isGrid) + s"ContinuousSpace[$name]: (${gridValues mkString (",")})" + else s"ContinuousSpace[$name]: ($lower -> $upper)" +} + +object ContinuousSpace { + + def apply(name: String, config: String) = { + new ContinuousSpace(name, config) + } + + def apply(name: String, config: String, doc: String) = { + new ContinuousSpace(name, config, doc=doc) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/DiscreteSpace.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/DiscreteSpace.scala new file mode 
100644 index 0000000..d793098 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/DiscreteSpace.scala @@ -0,0 +1,138 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.parameter + +import com.tencent.angel.spark.automl.utils.AutoMLException + +import scala.reflect.{ClassTag, _} +import scala.util.Random + +/** + * Search space with discrete values + * + * @param name : Name of the parameter + * @param values : List of all possible values + */ +class DiscreteSpace[T <: AnyVal : ClassTag]( + override val name: String, + var values: Array[T], + override val doc: String = "discrete param" + ) extends ParamSpace[T](name, doc) { + + private val helper: String = "supported format of discrete parameter: {0.1,0.2,0.3,0.4} or {0.1:1:0.1}" + + override val pType: String = "discrete" + override val vType = classTag[T].runtimeClass.getSimpleName.toLowerCase + + def this(name: String, config: String, doc: String) = { + this(name, Array.empty[T], doc) + this.values = parseConfig(config) + } + + def this(name: String, config: String) = { + this(name, config, "discrete param") + } + + def parseConfig(input: String): Array[T] = { + assert(input.startsWith("{") && input.endsWith("}")) + val config = input.substring(1, input.length - 1) + val values: Array[T] = config.trim match { + case _ if config.contains(",") => + config.split(',').map(asType) + case _ if config.contains(":") => + val splits = config.split(':') + splits.length match { + case 2 => (splits(0).toDouble to splits(1).toDouble by 1.0f).toArray.map(asType) + case 3 => (splits(0).toDouble to splits(1).toDouble by splits(2).toDouble).toArray.map(asType) + case _ => throw new AutoMLException(s"invalid discrete, $helper") + } + case _ => throw new AutoMLException(s"invalid discrete, $helper") + } + values + } + + def asType(s: String): T = { + val c = implicitly[ClassTag[T]].runtimeClass + c match { + case _ if c == classOf[Int] => s.toInt.asInstanceOf[T] + case _ if c == classOf[Long] => s.toLong.asInstanceOf[T] + case _ if c == classOf[Float] => s.toFloat.asInstanceOf[T] + case _ if c == classOf[Double] => s.toDouble.asInstanceOf[T] + case _ => throw new AutoMLException(s"auto param with type ${c} is not supported") + } + } + + def asType(s: Double): T = { + val c = implicitly[ClassTag[T]].runtimeClass + c match { + case _ if c == classOf[Int] => s.toInt.asInstanceOf[T] + case _ if c == classOf[Long] => s.toLong.asInstanceOf[T] + case _ if c == classOf[Float] => s.toFloat.asInstanceOf[T] + case _ if c == classOf[Double] => s.toDouble.asInstanceOf[T] + case _ => throw new AutoMLException(s"auto param with type ${c} is not supported") + } + } + + def asDouble(num: AnyVal): Double = { + num match { + case i: Int => i.toDouble + case i: Long => i.toLong + case i: Float => 
i.toDouble + case i: Double => i + case _ => throw new AutoMLException(s"type ${num.getClass} is not supported") + } + } + + val rd = new Random() + + def getValues: Array[Double] = values.map(asDouble) + + def numValues: Int = values.length + + def toGridSearch: ParamSpace[T] = this + + def toRandomSpace: ParamSpace[T] = this + + def sample(size: Int): List[T] = { + List.fill[T](size)(sampleOne) + } + + def sampleOne(): T = values(rd.nextInt(numValues)) + + override def toString: String = s"DiscreteSpace[$name]: (${values mkString (",")})" + +} + +object DiscreteSpace { + + def apply[T <: AnyVal : ClassTag](name: String, config: String): DiscreteSpace[T] = { + new DiscreteSpace[T](name, config) + } + + def apply[T <: AnyVal : ClassTag](name: String, config: String, doc: String): DiscreteSpace[T] = { + new DiscreteSpace[T](name, config, doc) + } + + def main(args: Array[String]): Unit = { + val obj = new DiscreteSpace[Int]("test", "1:10:1") + println(obj.toString) + println(obj.getValues(1)) + println(obj.sample(2).toString()) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamParser.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamParser.scala new file mode 100644 index 0000000..fe11f95 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamParser.scala @@ -0,0 +1,114 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
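The ContinuousSpace and DiscreteSpace classes above both build their search spaces from the compact config strings described in their helper messages: "[lower,upper]" or "[lower:upper:num]" for a continuous range, "{v1,v2,...}" or "{start:end:step}" for a discrete set. A small illustrative usage with made-up parameter names and values:

import com.tencent.angel.spark.automl.tuner.parameter.{ContinuousSpace, DiscreteSpace}

// Illustrative: build both space types from config strings and sample them.
object ParamSpaceSketch {
  def main(args: Array[String]): Unit = {
    val lr     = ContinuousSpace("ml.learn.rate", "[0.01,1]")    // plain interval
    val lrGrid = ContinuousSpace("ml.learn.rate", "[0.01:1:10]") // 10-point grid
    val decay  = DiscreteSpace[Double]("ml.learn.decay", "{0,0.01,0.1}")

    println(lr.sampleOne())                  // uniform draw from [0.01, 1]
    println(lrGrid.getValues.mkString(","))  // the 10 grid points
    println(decay.sample(3))                 // three draws from the listed values
  }
}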
+ * + */ + +package com.tencent.angel.spark.automl.tuner.parameter + +import com.tencent.angel.spark.automl.utils.AutoMLException + +import scala.beans.BeanProperty + +/** + * parse configuration of auto tuning from the command + * valid format: PARAM_NAME|PARAM_TYPE|VALUE_TYPE|PARAM_RANGE|OPTIONS, multiple params are separated by # + * example: ml.learn.rate|C|double|0.01,1|linear#ml.learn.decay|D|double|0,0.01,0.1 + */ +object ParamParser { + + val helper = "supported format: PARAM_NAME|PARAM_TYPE|VALUE_TYPE|PARAM_RANGE|OPTIONS, OPTIONS is optional" + val helper_param_type = "param type should be D, C or CA (D means discrete, C means continuous, and CA means categorical" + val helper_value_type = "value type should be float, double, int or long" + + val INTER_PARAM_SEP = "#" + val INNER_PARAM_SEP = "\\|" + + def parse(input: String): Array[ParamConfig] = { + separateParams(input).map(parseOneParam) + } + + /** + * separate the config command to a set of parameter config + */ + def separateParams(input: String): Array[String] = { + val params = input.split(INTER_PARAM_SEP) + assert(params.nonEmpty, helper) + params + } + + /** + * parse config for each parameter + */ + def parseOneParam(input: String): ParamConfig = { + val configs = input.split(INNER_PARAM_SEP) + println(s"configs: ${configs.mkString(",")}") + assert(configs.size == 4 || configs.size == 5, helper) + val paramName = getParamName(configs) + val paramType = getParamType(configs) + val valueType = getValueType(configs, paramType) + val paramRange = getParamRange(configs, paramType) + val options = getOptions(configs) + new ParamConfig(paramName, paramType, valueType, paramRange, options) + } + + def getParamName(configs: Array[String]): String = configs(0) + + def getParamType(configs: Array[String]): String = { + val paramType = configs(1).toUpperCase + paramType match { + case "D" => "discrete" + case "C" => "continuous" + case "CA" => "categorical" + case _ => throw new AutoMLException(helper_param_type) + } + } + + def getValueType(configs: Array[String], paramType: String): String = { + val valueType = configs(2).toLowerCase + paramType match { + case "discrete" => + assert(Array("float", "double", "int", "long").contains(valueType), helper_value_type) + valueType + case "continuous" => + "double" + case "categorical" => + valueType + } + } + + def getParamRange(configs: Array[String], paramType: String): String = { + paramType match { + case "discrete" => configs(3).mkString("{", "", "}") + case "continuous" => configs(3).mkString("[", "", "]") + // TODO: use categorical specific format + case "categorical" => configs(3) + } + } + + + def getOptions(configs: Array[String]): Option[String] = { + if (configs.size == 4) + None + else + Some(configs(4)) + } + +} + +class ParamConfig(@BeanProperty var paramName: String, + @BeanProperty var paramType: String, + @BeanProperty var valueType: String, + @BeanProperty var paramRange: String, + @BeanProperty var option: Option[String]) diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamSpace.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamSpace.scala new file mode 100644 index 0000000..338df9e --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/parameter/ParamSpace.scala @@ -0,0 +1,77 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. 
All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.parameter + +import com.tencent.angel.spark.automl.utils.AutoMLException + +import scala.reflect.ClassTag + + +/** + * Base class of a single parameter's search space. + * + * @param name : Name of the parameter + */ +abstract class ParamSpace[+T: ClassTag](val name: String, + val doc: String = "param with search space") { + + val pType: String + + val vType: String + + def sample(size: Int): List[T] + + def sampleOne(): T + + def getValues: Array[Double] + + def numValues: Int +} + +object ParamSpace { + + def fromConfigString(name: String, config: String): ParamSpace[Double] = { + val vType = + if (config.trim.startsWith("[") && config.trim.endsWith("]")) + "continuous" + else if (config.trim.startsWith("{") && config.trim.endsWith("}")) + "discrete" + else "none" + vType match { + case "continuous" => ContinuousSpace(name, config) + case "discrete" => DiscreteSpace[Double](name, config) + case _ => throw new AutoMLException(s"auto param config is not supported") + } + } + + def fromConfigString(name: String, config: String, doc: String): ParamSpace[Double] = { + val vType = + if (config.trim.startsWith("[") && config.trim.endsWith("]")) + "continuous" + else if (config.trim.startsWith("{") && config.trim.endsWith("}")) + "discrete" + else "none" + vType match { + case "continuous" => ContinuousSpace(name, config, doc) + case "discrete" => DiscreteSpace[Double](name, config, doc) + case _ => throw new AutoMLException(s"auto param config is not supported") + } + } + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/Solver.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/Solver.scala new file mode 100644 index 0000000..fe6e8d5 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/Solver.scala @@ -0,0 +1,201 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
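ParamParser above splits the '#'-separated command into per-parameter configs, and ParamSpace.fromConfigString turns each range string back into a typed search space. A hypothetical end-to-end snippet using the format documented in ParamParser's header comment:

import com.tencent.angel.spark.automl.tuner.parameter.{ParamParser, ParamSpace}

// Illustrative: parse the documented command format and rebuild search spaces from it.
object ParamParserSketch {
  def main(args: Array[String]): Unit = {
    val cmd = "ml.learn.rate|C|double|0.01,1|linear#ml.learn.decay|D|double|0,0.01,0.1"
    ParamParser.parse(cmd).foreach { c =>
      // paramRange already carries the [...] / {...} wrapper that ParamSpace expects
      val space = ParamSpace.fromConfigString(c.paramName, c.paramRange)
      println(s"${c.paramName} (${c.paramType}/${c.valueType}): $space")
    }
  }
}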
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.solver + +import java.text.SimpleDateFormat +import java.util.Date + +import com.tencent.angel.spark.automl.tuner.TunerParam +import com.tencent.angel.spark.automl.tuner.acquisition.optimizer.{AcqOptimizer, RandomSearch} +import com.tencent.angel.spark.automl.tuner.acquisition.{Acquisition, EI} +import com.tencent.angel.spark.automl.tuner.config.{Configuration, ConfigurationSpace} +import com.tencent.angel.spark.automl.tuner.parameter.{ContinuousSpace, DiscreteSpace, ParamSpace} +import com.tencent.angel.spark.automl.tuner.surrogate._ +import com.tencent.angel.spark.automl.utils.AutoMLException +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.Vector + +import scala.collection.mutable + +class Solver( + val cs: ConfigurationSpace, + val surrogate: Surrogate, + val acqFuc: Acquisition, + val optimizer: AcqOptimizer, + val surrogateMode: SurrogateMode.Value) { + + val LOG: Log = LogFactory.getLog(classOf[Solver]) + + val PARAM_TYPES: Array[String] = Array("discrete", "continuous") + + val valid: Boolean = { + // ensure grid + surrogateMode match { + case SurrogateMode.GRID => cs.setAllToGrid() + case _ => + } + true + } + + def getHistory(): (Array[Vector], Array[Double]) = (surrogate.preX.toArray, surrogate.preY.toArray) + + def getSurrogate: Surrogate = surrogate + + def addParam(param: ParamSpace[AnyVal]): Unit = { + cs.addParam(param) + } + + def addParam(pName: String, pType: String, vType: String, config: String): Unit = { + pType.toLowerCase match { + case "discrete" => + vType.toLowerCase match { + case "float" => addParam(new DiscreteSpace[Float](pName, config)) + case "double" => addParam(new DiscreteSpace[Double](pName, config)) + case "int" => addParam(new DiscreteSpace[Int](pName, config)) + case "long" => addParam(new DiscreteSpace[Long](pName, config)) + case _ => throw new AutoMLException(s"unsupported value type $vType") + } + case "continuous" => + vType.toLowerCase match { + case "double" => addParam(new ContinuousSpace(pName, config)) + case _ => throw new AutoMLException(s"unsupported value type $vType") + } + case _ => throw new AutoMLException(s"unsupported param type $pType, should be ${PARAM_TYPES.mkString(",")}") + } + } + + def getParamTypes: mutable.Map[String, (String, String)] = cs.paramType + + def getParamType(pName: String): (String, String) = { + cs.getParamType(pName) + } + + /** + * Suggests configurations to evaluate. 
+ */ + def suggest(): Array[Configuration] = { + TunerParam.index += 1 + surrogateMode match { + case SurrogateMode.GP | SurrogateMode.RF => + val acqAndConfig = optimizer.maximize(TunerParam.batchSize) + val now = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(new Date) +// print(s"${TunerParam.index} ${now} suggest configurations: ") +// var i = 0 +// acqAndConfig.foreach { case (acq, config) => +// print(s">> config[${config.getVector.toArray.mkString("(", ",", ")")}], " + +// s"acquisition[$acq] << ") +// } +// println() + acqAndConfig.map(_._2) + case SurrogateMode.RANDOM => + cs.randomSample(TunerParam.batchSize) + case SurrogateMode.GRID => + cs.gridSample(TunerParam.batchSize) + } + } + + /** + * Feed evaluation result to the model + * + * @param configs : More evaluated configurations + * @param Y : More evaluation result + */ + def feed(configs: Array[Configuration], Y: Array[Double]): Unit = { + //println(s"feed ${configs.size} configurations") + if (!configs.isEmpty && !Y.isEmpty) { + if (surrogate.minimize) { + surrogate.update(configs.map(_.getVector), Y.map(-_)) + } + else { + surrogate.update(configs.map(_.getVector), Y) + } + } + cs.addHistories(configs.map(_.getVector)) + } + + def feed(config: Configuration, y: Double): Unit = { + if (surrogate.minimize) + surrogate.update(config.getVector, -y) + else { + surrogate.update(config.getVector, y) + cs.addHistory(config.getVector) + } + } + + def optimal(): (Vector, Double) = surrogate.curBest + + def stop(): Unit = { + surrogate.stop + } +} + +object Solver { + + def apply(cs: ConfigurationSpace, surrogate: Surrogate, acqFuc: Acquisition, optimizer: AcqOptimizer): Solver = { + new Solver(cs, surrogate, acqFuc, optimizer, SurrogateMode.GP) + } + + def apply(cs: ConfigurationSpace): Solver = { + val sur: Surrogate = new GPSurrogate(cs, minimize = true) + val acq: Acquisition = new EI(sur, 0.1f) + val opt: AcqOptimizer = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, SurrogateMode.GP) + } + + def apply(cs: ConfigurationSpace, minimize: Boolean, surrogate: String): Solver = { + val mode = SurrogateMode.fromString(surrogate) + mode match { + case SurrogateMode.GP => + val sur: Surrogate = new GPSurrogate(cs, minimize) + val acq: Acquisition = new EI(sur, 0.1f) + val opt: AcqOptimizer = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, mode) + case SurrogateMode.RF => + val sur: Surrogate = new RFSurrogate(cs, minimize) + val acq: Acquisition = new EI(sur, 0.1f) + val opt: AcqOptimizer = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, mode) + case SurrogateMode.RANDOM => + val sur = new NormalSurrogate(cs, minimize) + val acq = new EI(sur, 0.1f) + val opt = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, mode) + case SurrogateMode.GRID => + val sur = new NormalSurrogate(cs, minimize) + val acq = new EI(sur, 0.1f) + val opt = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, mode) + } + } + + def apply[T <: AnyVal](array: Array[ParamSpace[T]], minimize: Boolean, surrogate: String): Solver = { + val cs: ConfigurationSpace = new ConfigurationSpace("cs") + array.foreach(cs.addParam) + Solver(cs, minimize, surrogate) + } + + def apply(minimize: Boolean): Solver = { + val cs: ConfigurationSpace = new ConfigurationSpace("cs") + val sur: Surrogate = new GPSurrogate(cs, minimize) + val acq: Acquisition = new EI(sur, 0.1f) + val opt: AcqOptimizer = new RandomSearch(acq, cs) + new Solver(cs, sur, acq, opt, SurrogateMode.GP) + } + +} diff --git 
a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/SolverWithTrail.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/SolverWithTrail.scala new file mode 100644 index 0000000..88a4c9d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/solver/SolverWithTrail.scala @@ -0,0 +1,46 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.solver + +import com.tencent.angel.spark.automl.tuner.config.Configuration +import com.tencent.angel.spark.automl.tuner.trail.Trail +import org.apache.spark.ml.linalg.Vector + +class SolverWithTrail(val solver: Solver, val trail: Trail) { + + /** + * The main Bayesian optimization loop + * + * @param numIter : Number of Iterations + * @param X : Initial data points that are already evaluated + * @param Y : Initial function values of the already evaluated points + * @return Incumbent and function value of the incumbent + */ + def run(numIter: Int, X: Array[Configuration] = null, Y: Array[Double] = null): (Vector, Double) = { + if (X != null && Y != null && X.size == Y.size) + solver.feed(X, Y) + (0 until numIter).foreach { iter => + println(s"------iteration $iter starts------") + val configs: Array[Configuration] = solver.suggest() + val results: Array[Double] = trail.evaluate(configs) + solver.feed(configs, results) + } + solver.surrogate.curBest + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/GPSurrogate.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/GPSurrogate.scala new file mode 100644 index 0000000..5aea4b2 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/GPSurrogate.scala @@ -0,0 +1,86 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
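SolverWithTrail above wires a Solver to a Trail implementation and runs the suggest / evaluate / feed loop for a fixed number of iterations. A hypothetical driver that uses the TestTrail defined later in this patch (it scores a configuration by the squared norm of its vector); the parameter names and ranges are made up:

import com.tencent.angel.spark.automl.tuner.solver.{Solver, SolverWithTrail}
import com.tencent.angel.spark.automl.tuner.trail.TestTrail

// Illustrative Bayesian-optimisation driver; Solver(true) builds the default
// GP surrogate, EI acquisition and random-search acquisition optimizer.
object TunerSketch {
  def main(args: Array[String]): Unit = {
    val solver = Solver(true) // minimize the objective
    solver.addParam("ml.learn.rate", "continuous", "double", "[0.01,1]")
    solver.addParam("ml.learn.decay", "discrete", "double", "{0,0.01,0.1}")

    val loop = new SolverWithTrail(solver, new TestTrail())
    val (bestConfig, bestValue) = loop.run(10)
    println(s"best config: ${bestConfig.toArray.mkString(",")}, best value: $bestValue")
    solver.stop()
  }
}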
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.surrogate + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace +import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso +import com.tencent.angel.spark.automl.tuner.model.GPModel +import com.tencent.angel.spark.automl.utils.DataUtils +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.Vector + +class GPSurrogate( + override val cs: ConfigurationSpace, + override val minimize: Boolean = true) + extends Surrogate(cs, minimize) { + + override val LOG: Log = LogFactory.getLog(classOf[RFSurrogate]) + + val covFunc = Matern5Iso() + val initCovParams = BDV(1.0, 1.0) + val initNoiseStdDev = 0.1 + val gpModel: GPModel = GPModel(covFunc, initCovParams, initNoiseStdDev) + + /** + * Train the surrogate on curX and curY. + */ + override def train(): Unit = { + +// var tmpY = preY.toArray.map(x => math.pow(x,6)) +// tmpY = tmpY.map(x => x / tmpY.max) + + val breezeX: BDM[Double] = DataUtils.toBreeze(preX.toArray) + val breezeY: BDV[Double] = DataUtils.toBreeze(preY.toArray) +// val breezeY: BDV[Double] = DataUtils.toBreeze(tmpY) + val success = gpModel.fit(breezeX, breezeY) + if (!success) { + preX.remove(preX.length - 1) + preY.remove(preY.length - 1) + println(s"drop the new configuration owing to convergence failure.") + } + + /*println("Fitted covariance function params:") + println(gpModel.covParams) + println("Fitted noiseStdDev:") + println(gpModel.noiseStdDev) + println("\n")*/ + + } + + /** + * Predict means and variances for a single given X. + * + * @param X + * @return a tuple of (mean, variance) + */ + override def predict(X: Vector): (Double, Double) = { + val breezeX = DataUtils.toBreeze(X).toDenseMatrix + + val pred = gpModel.predict(breezeX) + + //println(s"predict of ${X.toArray.mkString(",")}: mean[${pred(0, 0)}] variance[${pred(0, 1)}]") + + (pred(0, 0), pred(0, 1)) + } + + override def stop(): Unit = { + + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/NormalSurrogate.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/NormalSurrogate.scala new file mode 100644 index 0000000..9c974d9 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/NormalSurrogate.scala @@ -0,0 +1,46 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.tuner.surrogate + +import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace +import org.apache.spark.ml.linalg.Vector + + +class NormalSurrogate(override val cs: ConfigurationSpace, + override val minimize: Boolean = true) extends Surrogate(cs, minimize) { + + override def update(X: Array[Vector], Y: Array[Double]): Unit = { + preX ++= X + preY ++= Y + } + + /** + * NormalSurrogate is designed for random-search and grid-search + * Thus it doesn't need train and predict function + */ + override def train(): Unit = {} + + + def predict(X: Vector): (Double, Double) = { + (0.0, 0.0) + } + + override def stop(): Unit = {} + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/RFSurrogate.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/RFSurrogate.scala new file mode 100644 index 0000000..32e779c --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/RFSurrogate.scala @@ -0,0 +1,96 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.surrogate + +import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace +import com.tencent.angel.spark.automl.utils.DataUtils +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.ml.regression.{DecisionTreeRegressionModel, RandomForestRegressionModel, RandomForestRegressor} +import org.apache.spark.sql.{DataFrame, SparkSession} + +class RFSurrogate( + override val cs: ConfigurationSpace, + override val minimize: Boolean = true) + extends Surrogate(cs, minimize) { + + override val LOG: Log = LogFactory.getLog(classOf[RFSurrogate]) + + var model: RandomForestRegressionModel = _ + val numTrees: Int = 5 + val maxDepth: Int = 2 + + val ss = SparkSession.builder() + .master("local") + .appName("test") + .getOrCreate() + + ss.sparkContext.setLogLevel("ERROR") + + override def train(): Unit = { + + if (preX.size < Math.pow(2, maxDepth - 1)) + return + +// var tmpY = preY.toArray.map(x => math.pow(x,6)) +// tmpY = tmpY.map(x => x / tmpY.max) +// val data: DataFrame = DataUtils.parse(ss, schema, preX.toArray, tmpY) + val data: DataFrame = DataUtils.parse(ss, schema, preX.toArray, preY.toArray) + + val rf = new RandomForestRegressor() + .setLabelCol("label") + .setFeaturesCol("features") + .setNumTrees(numTrees) + .setMaxDepth(maxDepth) + + model = rf.fit(data) + } + + /** + * Predict means and variances for a single given X. 
+ * + * @param X + * @return a tuple of (mean, variance) + */ + override def predict(X: Vector): (Double, Double) = { + + if (preX.size < Math.pow(2, maxDepth - 1)) { + return (0.0, 0.0) + } + + val preds = model.trees.map { tree: DecisionTreeRegressionModel => + val pred = tree.transform(DataUtils.parse(ss, schema, X)) + pred.select("prediction").first().getDouble(0) + } + + //println(s"tree predictions of ${X.toArray.mkString(",")}: ${preds.mkString(",")}") + + val mean: Double = preds.sum / preds.length + val variance = preds.map(x => Math.pow(x - mean, 2)).sum / preds.length + + //println(s"predict of ${X.toArray.mkString(",")}: mean[$mean] variance[$variance]") + + (mean, variance) + } + + override def stop(): Unit = { + ss.stop + } + +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/Surrogate.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/Surrogate.scala new file mode 100644 index 0000000..5339b18 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/Surrogate.scala @@ -0,0 +1,140 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.surrogate + +import com.tencent.angel.spark.automl.tuner.config.ConfigurationSpace +import org.apache.commons.logging.{Log, LogFactory} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.sql.types.{DataTypes, StructField, StructType} + +import scala.collection.mutable.ArrayBuffer + +/** + * Abstract base class for surrogate model. + * + * @param numParams : Number of parameters in a configuration + */ +abstract class Surrogate( + val cs: ConfigurationSpace, + val minimize: Boolean = true) { + + var fields: ArrayBuffer[StructField] = new ArrayBuffer[StructField]() + fields += DataTypes.createStructField("label", DataTypes.DoubleType, false) + fields += DataTypes.createStructField("features", DataTypes.createArrayType(DataTypes.DoubleType), false) + + val schema: StructType = StructType( + StructField("label", DataTypes.DoubleType, nullable = false) :: + StructField("features", DataTypes.createArrayType(DataTypes.DoubleType), false) :: + Nil) + + val LOG: Log = LogFactory.getLog(classOf[Surrogate]) + + // Previous input data points, (N, D) + var preX: ArrayBuffer[Vector] = new ArrayBuffer[Vector]() + // previous target value, (N, ) + var preY: ArrayBuffer[Double] = new ArrayBuffer[Double]() + + /** + * Train the surrogate on curX and curY. + */ + def train(): Unit + + /** + * Train the surrogate on X and Y. + * + * @param X : (N, D), input data points. + * @param Y : (N, 1), the corresponding target values. 
+ */ + def train(X: Array[Vector], Y: Array[Double]): Unit = { + preX.clear + preY.clear + preX ++ X + preY ++ Y + train + } + + /** + * Update the surrogate with more X and Y. + * + * @param X + * @param Y + */ + def update(X: Array[Vector], Y: Array[Double]): Unit = { + if (!X.isEmpty && !Y.isEmpty) { +// X.zip(Y).foreach(tuple => print(tuple._1, tuple._2)) + preX ++= X + preY ++= Y + train + } + } + + def print(X: Vector, y: Double): Unit = { + println(s"update surrogate with X[${X.toArray.mkString("(", ",", ")")}] " + + s"and Y[${if (minimize) -y else y}]") + } + + def update(X: Vector, y: Double): Unit = { +// print(X, y) + preX += X + preY += y + train + } + + /** + * Predict means and variances for given X. + * + * @param X + * @return tuples of (mean, variance) + */ + def predict(X: Array[Vector]): Array[(Double, Double)] = { + X.map(predict) + } + + /** + * Predict means and variances for a single given X. + * + * @param X + * @return a tuple of (mean, variance) + */ + def predict(X: Vector): (Double, Double) + + def stop(): Unit + + def curBest: (Vector, Double) = { + if (minimize) curMin else curMax + } + + def curMin: (Vector, Double) = { + if (preY.isEmpty) + (null, Double.MaxValue) + else { + val maxIdx: Int = preY.zipWithIndex.max._2 + (preX(maxIdx), -preY(maxIdx)) + } + } + + def curMax: (Vector, Double) = { + if (preY.isEmpty) + (null, Double.MinValue) + else { + val maxIdx: Int = preY.zipWithIndex.max._2 + (preX(maxIdx), preY(maxIdx)) + } + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/SurrogateMode.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/SurrogateMode.scala new file mode 100644 index 0000000..b4c0c51 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/surrogate/SurrogateMode.scala @@ -0,0 +1,33 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + +package com.tencent.angel.spark.automl.tuner.surrogate + + +object SurrogateMode extends Enumeration { + + type SurrogateMode = Value + + val GP = Value("GaussianProcess") + val RF = Value("RandomForest") + val RANDOM = Value("Random") + val GRID = Value("Grid") + + def fromString(mode: String): SurrogateMode = { + SurrogateMode.withName(mode) + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestRunner.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestRunner.scala new file mode 100644 index 0000000..d19b263 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestRunner.scala @@ -0,0 +1,34 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. 
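SurrogateMode above maps the user-facing strings onto the four supported strategies, and Solver.apply(cs, minimize, surrogate) above dispatches on exactly these values. An illustrative lookup; the strings must match the enumeration values verbatim, otherwise Enumeration.withName throws NoSuchElementException:

import com.tencent.angel.spark.automl.tuner.surrogate.SurrogateMode

// Illustrative: the accepted mode strings and the enum values they resolve to.
object SurrogateModeSketch {
  def main(args: Array[String]): Unit = {
    Seq("GaussianProcess", "RandomForest", "Random", "Grid").foreach { s =>
      println(s"$s -> ${SurrogateMode.fromString(s)}")
    }
  }
}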
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.trail + +import com.github.fommil.netlib.F2jBLAS +import com.tencent.angel.spark.automl.tuner.config.Configuration + +class TestRunner(config: Configuration) extends TrailRunner(config) { + + override def call(): Double = { + new F2jBLAS().ddot(config.getVector.size, + config.getVector.toDense.values, + 1, + config.getVector.toDense.values, + 1) + } + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestTrail.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestTrail.scala new file mode 100644 index 0000000..cec157a --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TestTrail.scala @@ -0,0 +1,35 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.trail + +import com.github.fommil.netlib.F2jBLAS +import com.tencent.angel.spark.automl.tuner.config.Configuration + +class TestTrail extends Trail { + + override def evaluate(config: Configuration): Double = { + val ret = new F2jBLAS().ddot(config.getVector.size, + config.getVector.toDense.values, + 1, + config.getVector.toDense.values, + 1) + println(s"evaluate ${config.getVector.toArray.mkString("(", ",", ")")}, result $ret") + ret + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/Trail.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/Trail.scala new file mode 100644 index 0000000..af8b2e3 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/Trail.scala @@ -0,0 +1,29 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. 
You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.trail + +import com.tencent.angel.spark.automl.tuner.config.Configuration + +abstract class Trail { + + def evaluate(configs: Array[Configuration]): Array[Double] = configs.map(evaluate) + + def evaluate(config: Configuration): Double + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TrailRunner.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TrailRunner.scala new file mode 100644 index 0000000..5786451 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/tuner/trail/TrailRunner.scala @@ -0,0 +1,32 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ + + +package com.tencent.angel.spark.automl.tuner.trail + +import java.util.concurrent.Callable + +import com.tencent.angel.spark.automl.tuner.config.Configuration + +abstract class TrailRunner(var config: Configuration) extends Callable[Double] { + + override def call(): Double + + def setConf(newConf: Configuration): Unit = { + config = newConf + } +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/ArgsUtil.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/ArgsUtil.scala new file mode 100644 index 0000000..af945e0 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/ArgsUtil.scala @@ -0,0 +1,41 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.utils + +import scala.collection.mutable + +object ArgsUtil { + + def parse(args: Array[String]): Map[String, String] = { + val cmdArgs = new mutable.HashMap[String, String]() + println("parsing parameter") + for (arg <- args) { + val sepIdx = arg.indexOf(":") + if (sepIdx != -1) { + val k = arg.substring(0, sepIdx).trim + val v = arg.substring(sepIdx + 1).trim + if (v != "" && v != "Nan" && v != null) { + cmdArgs.put(k, v) + println(s"param $k = $v") + } + } + } + cmdArgs.toMap + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/AutoMLException.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/AutoMLException.scala new file mode 100644 index 0000000..113285d --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/AutoMLException.scala @@ -0,0 +1,19 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + * + */ +package com.tencent.angel.spark.automl.utils + +class AutoMLException(msg: String) extends Exception(msg) diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/DataUtils.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/DataUtils.scala new file mode 100644 index 0000000..ace43a3 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/DataUtils.scala @@ -0,0 +1,63 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
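ArgsUtil above splits each command-line argument at the first ':' and silently drops entries that are malformed or carry an empty value. A tiny illustrative call with made-up arguments:

import com.tencent.angel.spark.automl.utils.ArgsUtil

// Illustrative: only well-formed "key:value" pairs survive parsing.
object ArgsUtilSketch {
  def main(args: Array[String]): Unit = {
    val parsed = ArgsUtil.parse(Array("mode:GaussianProcess", "numIter:20", "no-separator", "empty:"))
    println(parsed) // expected keys: mode, numIter
  }
}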
+ * + */ + + +package com.tencent.angel.spark.automl.utils + +import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV} +import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.{DataFrame, SparkSession} + +object DataUtils { + + def parse(ss: SparkSession, + schema: StructType, + X: Array[Vector], + Y: Array[Double]): DataFrame = { + require(X.size == Y.size, + "The size of configurations should be equal to the size of rewards.") + ss.createDataFrame( + Y.zip(X)).toDF("label", "features") + } + + def parse(ss: SparkSession, + schema: StructType, + X: Vector): DataFrame = { + parse(ss, schema, Array(X), Array(0)) + } + + def toBreeze(values: Array[Double]): BDV[Double] = { + new BDV[Double](values) + } + + def toBreeze(vector: Vector): BDV[Double] = vector match { + case sv: SparseVector => new BDV[Double](vector.toDense.values) + case dv: DenseVector => new BDV[Double](dv.values) + } + + def toBreeze(X: Array[Vector]): BDM[Double] = { + val mat = BDM.zeros[Double](X.size, X(0).size) + for (i <- 0 until X.size) { + for (j <- 0 until X(0).size) { + mat(i, j) = X(i)(j) + } + } + mat + } + +} diff --git a/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/Distribution.scala b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/Distribution.scala new file mode 100644 index 0000000..82cf747 --- /dev/null +++ b/tools/kal-test/src/main/scala/com/tencent/angle/spark/automl/utils/Distribution.scala @@ -0,0 +1,30 @@ +/* + * Tencent is pleased to support the open source community by making Angel available. + * + * Copyright (C) 2017-2018 THL A29 Limited, a Tencent company. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in + * compliance with the License. You may obtain a copy of the License at + * + * https://opensource.org/licenses/Apache-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ * + */ + + +package com.tencent.angel.spark.automl.utils + +object Distribution extends Enumeration { + + type Distribution = Value + + val LINEAR = Value("1") + + def checkExists(distribution: String): Boolean = this.values.exists(_.toString == distribution) + + def printAll(): Unit = this.values.foreach(println) +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/classification/KNNClassifier.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/classification/KNNClassifier.scala new file mode 100644 index 0000000..2869e4c --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/classification/KNNClassifier.scala @@ -0,0 +1,240 @@ +package org.apache.spark.ml.classification + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.knn._ +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.HasWeightCol +import org.apache.spark.ml.util.{Identifiable, SchemaUtils} +import org.apache.spark.ml.linalg._ +import org.apache.spark.ml.feature.LabeledPoint +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.types.{DoubleType, StructType} +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.storage.StorageLevel +import org.apache.spark.SparkException +import org.apache.spark.ml.stat.MultiClassSummarizer + +import scala.collection.mutable.ArrayBuffer + +/** + * [[https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm]] for classification. + * An object is classified by a majority vote of its neighbors, with the object being assigned to + * the class most common among its k nearest neighbors. + */ +class KNNClassifier(override val uid: String) extends ProbabilisticClassifier[Vector, KNNClassifier, KNNClassificationModel] +with KNNParams with HasWeightCol { + + def this() = this(Identifiable.randomUID("knnc")) + + /** @group setParam */ + override def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + override def setLabelCol(value: String): this.type = { + set(labelCol, value) + + if ($(weightCol).isEmpty) { + set(inputCols, Array(value)) + } else { + set(inputCols, Array(value, $(weightCol))) + } + } + + //fill in default label col + setDefault(inputCols, Array($(labelCol))) + + /** @group setWeight */ + def setWeightCol(value: String): this.type = { + set(weightCol, value) + + if (value.isEmpty) { + set(inputCols, Array($(labelCol))) + } else { + set(inputCols, Array($(labelCol), value)) + } + } + + setDefault(weightCol -> "") + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setTopTreeSize(value: Int): this.type = set(topTreeSize, value) + + /** @group setParam */ + def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value) + + /** @group setParam */ + def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value) + + /** @group setParam */ + def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value) + + /** @group setParam */ + def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value) + + /** @group setParam */ + def setSeed(value: Long): this.type = set(seed, value) + + override protected def train(dataset: Dataset[_]): KNNClassificationModel = { + // Extract columns from data. If dataset is persisted, do not persist oldDataset. 
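    // Review note added here (not part of the original spark-knn code): the block below pulls
    // (label, features) pairs out of the dataset, caches them only when the caller has not
    // already persisted the input, and aggregates a per-class label histogram with
    // MultiClassSummarizer so that invalid labels can be rejected before the shared KNN
    // tree model is fitted and converted to a classification model.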
+ val instances = extractLabeledPoints(dataset).map { + case LabeledPoint(label: Double, features: Vector) => (label, features) + } + val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE + if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK) + + val labelSummarizer = instances.treeAggregate( + new MultiClassSummarizer)( + seqOp = (c, v) => (c, v) match { + case (labelSummarizer: MultiClassSummarizer, (label: Double, features: Vector)) => + labelSummarizer.add(label) + }, + combOp = (c1, c2) => (c1, c2) match { + case (classSummarizer1: MultiClassSummarizer, classSummarizer2: MultiClassSummarizer) => + classSummarizer1.merge(classSummarizer2) + }) + + val histogram = labelSummarizer.histogram + val numInvalid = labelSummarizer.countInvalid + val numClasses = histogram.length + + if (numInvalid != 0) { + val msg = s"Classification labels should be in {0 to ${numClasses - 1} " + + s"Found $numInvalid invalid labels." + logError(msg) + throw new SparkException(msg) + } + + val knnModel = copyValues(new KNN()).fit(dataset) + knnModel.toNewClassificationModel(uid, numClasses) + } + + override def fit(dataset: Dataset[_]): KNNClassificationModel = { + // Need to overwrite this method because we need to manually overwrite the buffer size + // because it is not supposed to stay the same as the Classifier if user sets it to -1. + transformSchema(dataset.schema, logging = true) + val model = train(dataset) + val bufferSize = model.getBufferSize + copyValues(model.setParent(this)).setBufferSize(bufferSize) + } + + override def copy(extra: ParamMap): KNNClassifier = defaultCopy(extra) +} + +class KNNClassificationModel private[ml]( + override val uid: String, + val topTree: Broadcast[Tree], + val subTrees: RDD[Tree], + val _numClasses: Int + ) extends ProbabilisticClassificationModel[Vector, KNNClassificationModel] +with KNNModelParams with HasWeightCol with Serializable { + require(subTrees.getStorageLevel != StorageLevel.NONE, + "KNNModel is not designed to work with Trees that have not been cached") + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setBufferSize(value: Double): this.type = set(bufferSize, value) + + override def numClasses: Int = _numClasses + + //TODO: This can benefit from DataSet API + override def transform(dataset: Dataset[_]): DataFrame = { + val getWeight: Row => Double = { + if($(weightCol).isEmpty) { + r => 1.0 + } else { + r => r.getDouble(1) + } + } + + val neighborRDD : RDD[(Long, Array[(Row, Double)])] = transform(dataset, topTree, subTrees) + val merged = neighborRDD + .map { + case (id, labelsDists) => + val (labels, _) = labelsDists.unzip + val vector = new Array[Double](numClasses) + var i = 0 + while (i < labels.length) { + vector(labels(i).getDouble(0).toInt) += getWeight(labels(i)) + i += 1 + } + val rawPrediction = Vectors.dense(vector) + lazy val probability = raw2probability(rawPrediction) + lazy val prediction = probability2prediction(probability) + + val values = new ArrayBuffer[Any] + if ($(rawPredictionCol).nonEmpty) { + values.append(rawPrediction) + } + if ($(probabilityCol).nonEmpty) { + values.append(probability) + } + if ($(predictionCol).nonEmpty) { + values.append(prediction) + } + + (id, values) + } + + dataset.sqlContext.createDataFrame( + dataset.toDF().rdd.zipWithIndex().map { case (row, i) => (i, row) } + .leftOuterJoin(merged) //make sure we don't lose any observations + .map { + case (i, (row, values)) => Row.fromSeq(row.toSeq ++ values.get) + }, 
+ transformSchema(dataset.schema) + ) + } + + override def transformSchema(schema: StructType): StructType = { + var transformed = schema + if ($(rawPredictionCol).nonEmpty) { + transformed = SchemaUtils.appendColumn(transformed, $(rawPredictionCol), new VectorUDT) + } + if ($(probabilityCol).nonEmpty) { + transformed = SchemaUtils.appendColumn(transformed, $(probabilityCol), new VectorUDT) + } + if ($(predictionCol).nonEmpty) { + transformed = SchemaUtils.appendColumn(transformed, $(predictionCol), DoubleType) + } + transformed + } + + override def copy(extra: ParamMap): KNNClassificationModel = { + val copied = new KNNClassificationModel(uid, topTree, subTrees, numClasses) + copyValues(copied, extra).setParent(parent) + } + + override protected def raw2probabilityInPlace(rawPrediction: Vector): Vector = { + rawPrediction match { + case dv: DenseVector => + var i = 0 + val size = dv.size + + var sum = 0.0 + while (i < size) { + sum += dv.values(i) + i += 1 + } + + i = 0 + while (i < size) { + dv.values(i) /= sum + i += 1 + } + + dv + case sv: SparseVector => + throw new RuntimeException("Unexpected error in KNNClassificationModel:" + + " raw2probabilitiesInPlace encountered SparseVector") + } + } + + override def predictRaw(features: Vector): Vector = { + throw new SparkException("predictRaw function should not be called directly since kNN prediction is done in distributed fashion. Use transform instead.") + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/clustering/DBSCANRunner.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/clustering/DBSCANRunner.scala new file mode 100644 index 0000000..0a8d981 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/clustering/DBSCANRunner.scala @@ -0,0 +1,128 @@ +package org.apache.spark.ml.clustering + +import com.bigdata.utils.Utils + +import org.apache.spark.SparkConf +import org.apache.spark.ml.linalg.{VectorUDT, Vectors} +import org.apache.spark.sql.{Row, SparkSession} +import org.apache.spark.sql.types.{StructField, StructType} +import org.yaml.snakeyaml.{DumperOptions, TypeDescription, Yaml} +import org.yaml.snakeyaml.constructor.Constructor +import org.yaml.snakeyaml.nodes.Tag +import org.yaml.snakeyaml.representer.Representer + +import java.io.{File, FileWriter} +import java.util +import scala.beans.BeanProperty + +class DBSCANConfig extends Serializable { + + @BeanProperty var dbscan: util.HashMap[String, Object] = _ +} + +class DBSParams extends Serializable { + @BeanProperty var numPartitions: Int = _ + @BeanProperty var epsilon: Double = _ + @BeanProperty var minPoints: Int = _ + @BeanProperty var sampleRate: Double = _ + @BeanProperty var dataPath: String = _ + @BeanProperty var datasetName: String = _ + @BeanProperty var datasetCpuName: String = _ + @BeanProperty var isRaw: String = "no" + @BeanProperty var costTime: Double = _ + @BeanProperty var numTotal: util.HashMap[String, Int] = _ + @BeanProperty var algorithmName: String = _ + @BeanProperty var testcaseType: String = _ +} + +object DBSCANRunner { + + def main(args: Array[String]): Unit = { + + try { + val modelConfSplit = args(0).split("-") + val (datasetName, platformName) = (modelConfSplit(0), modelConfSplit(1)) + val dataPath = args(1) + val datasetCpuName = s"${datasetName}_${platformName}" + + val stream = Utils.getStream("conf/ml/dbscan/dbscan.yml") + val representer = new Representer + representer.addClassTag(classOf[DBSParams], Tag.MAP) + val options = new DumperOptions + 
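      // Review note added here (not in the original patch): BLOCK flow style, set on the next
      // line, keeps the dumped report YAML human-readable; the Yaml(Constructor, Representer,
      // options) instance plus addTypeDescription below let snakeyaml map the entries of
      // conf/ml/dbscan/dbscan.yml onto the DBSCANConfig and DBSParams beans declared above.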
options.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK) + val yaml = new Yaml(new Constructor(classOf[DBSCANConfig]), representer, options) + val description = new TypeDescription(classOf[DBSParams]) + yaml.addTypeDescription(description) + val config: DBSCANConfig = yaml.load(stream).asInstanceOf[DBSCANConfig] + val paramsMap = config.dbscan.get(datasetName).asInstanceOf[util.HashMap[String, Object]] + val params = new DBSParams() + params.setDataPath(dataPath) + params.setDatasetName(datasetName) + params.setDatasetCpuName(datasetCpuName) + params.setAlgorithmName("DBSCAN") + params.setTestcaseType(s"DBSCAN_${datasetName}") + params.setEpsilon(paramsMap.get("epsilon").toString.toDouble) + params.setMinPoints(paramsMap.get("minPoints").toString.toInt) + params.setSampleRate(paramsMap.get("sampleRate").toString.toDouble) + params.setNumPartitions(paramsMap.get("numPartitions").toString.toInt) + + + val conf = new SparkConf().setAppName(s"DBSCAN_${datasetName}_${platformName}") + val spark = SparkSession.builder.config(conf).getOrCreate() + val dbscan = new DBSCANKernel() + val (numTotal, costTime) = dbscan.runJob(spark, params) + params.setNumTotal(numTotal) + params.setCostTime(costTime) + + Utils.checkDirs("report") + val writer = new FileWriter(s"report/${params.testcaseType}_${ + Utils.getDateStrFromUTC("yyyyMMdd_HHmmss", + System.currentTimeMillis()) + }.yml") + yaml.dump(params, writer) + println(s"Exec Successful: costTime: ${params.getCostTime}s") + } catch { + case e: Throwable => + println(s"Exec Failure: ${e.getMessage}") + throw e + } + } +} + +class DBSCANKernel { + def runJob(spark: SparkSession, params: DBSParams): (util.HashMap[String, Int],Double) = { + val startTime = System.currentTimeMillis() + val sc = spark.sparkContext + val inputSchema = StructType( + Seq( + StructField("features", new VectorUDT, false) + ) + ) + val dataRow = sc.textFile(params.dataPath).map(x => Row(Vectors.dense(x.split(" ").map(_.toDouble)))) + val data = spark.createDataFrame(dataRow, inputSchema) + .repartition(params.numPartitions).cache() + + val model = new DBSCAN() + .setEpsilon(params.epsilon) + .setMinPoints(params.minPoints) + .setSampleRate(params.sampleRate) + + val globalClustersDF = model.fitPredict(data) + globalClustersDF.foreachPartition((_:Iterator[Row]) => {}) + + globalClustersDF.cache() + val pointTypeArr = globalClustersDF.select("prediction").collect().map(_(0)).map(x => x.toString.toInt) + val clusterIDArr = globalClustersDF.select("label").collect().map(_(0)).map(x => x.toString.toInt) + val costTime = (System.currentTimeMillis() - startTime) / 1000.0 + val numClusters: Int = clusterIDArr.toSet.size + val numCore: Int = pointTypeArr.count(_ == 0) + val numBorder: Int = pointTypeArr.count(_ == 1) + val numNoise: Int = pointTypeArr.count(_ == -1) + val numTotal = new util.HashMap[String, Int]() + numTotal.put(s"numClusters",numClusters) + numTotal.put(s"numCore",numCore) + numTotal.put(s"numBorder",numBorder) + numTotal.put(s"numNoise",numNoise) + (numTotal,costTime) + } +} \ No newline at end of file diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/feature/FeatureEncodingOrigin.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/feature/FeatureEncodingOrigin.scala new file mode 100644 index 0000000..ac90004 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/feature/FeatureEncodingOrigin.scala @@ -0,0 +1,165 @@ +// scalastyle:off header.matches +/* + * This file to You under the Apache License, Version 2.0; + * you may not use this 
file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +package org.apache.spark.ml.feature + +import java.io.File +import java.net.URI +import java.util.Date + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.collection.mutable.ArrayBuffer +import scala.io.Source + +import com.fasterxml.jackson.databind.ObjectMapper +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.{FileSystem, FileUtil, Path} + +import org.apache.spark.ml.StaticUtils +import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.functions.{col, lit, udf} + +class FeatureEncodingOrigin extends Serializable{ + var mapLoadPath = "" + var dataPath = "" + var outputFilePath = "" + var localSavePath = "" + var encodeColumns = Array[String]() + var numThread = 40 + + def setMapLoadPath(mapLoadPath: String): this.type = { + this.mapLoadPath = mapLoadPath + this + } + + def setDataPath(dataPath: String): this.type = { + this.dataPath = dataPath + this + } + + def setOutputFilePath(outputFilePath: String): this.type = { + this.outputFilePath = outputFilePath + this + } + + def setLocalSavePath(localSavePath: String): this.type = { + this.localSavePath = localSavePath + this + } + + def setEncodeColumns(encodeColumns: String): this.type = { + this.encodeColumns = encodeColumns.split(",") + this + } + + def setNumThread(numThread: Int): this.type = { + this.numThread = numThread + this + } + + def parseJsonToIntMap(json: String): mutable.Map[String, Int] = { + val mapper = new ObjectMapper() + val node = mapper.readValue(json, classOf[java.util.HashMap[String, Int]]) + node.asScala + } + + def loadJsonToString(path: String): String = { + Source.fromFile(path, "utf-8").mkString + } + + def padZero(input: Array[Int], maxLength: Int): Array[Int] = { + if (input.length > maxLength) { + input.dropRight(input.length-maxLength) + } else { + input.++(Array.ofDim[Int](maxLength-input.length)) + } + } + + def transform(input: DataFrame, featureMapKey: String, + featureMap: Map[String, Int], inputCol: String*): DataFrame = { + if (featureMap.isEmpty) { + throw new Exception("featureMap is empty") + } + + val suffixName = "_index" + val transformUDF = udf((maxLengthKey: String, value: String) => { + val transformList = ArrayBuffer[Int]() + if (featureMap.contains(featureMapKey + "," + value)) { + transformList.append(featureMap(featureMapKey + "," + value)) + } else { + // use 1 as feature index if not found + transformList.append(1) + } + + // return the maxLength array + padZero(transformList.toArray, 1) + }) + + var data = input + for (cols <- inputCol) { + data = data.withColumn( + cols + suffixName, + transformUDF( + lit(cols), + col(cols) + )) + } + data + } + + def dirDel(path: File) { + if (!path.exists()) { + return + } + if (path.isFile()) { + path.delete() + return + } + val file: Array[File] = path.listFiles() + for (d <- file) { + dirDel(d) + } + path.delete() + } + + def copyFileToLocal(spark: SparkSession, hdfsPath: String, localPath: String): Unit = { + val localFilePath = new File(localPath) + dirDel(localFilePath) + if (!localFilePath.exists()) { + localFilePath.mkdirs() + } + val fs = FileSystem.get(new URI(hdfsPath), new Configuration()) + val fileArray = FileUtil.stat2Paths(fs.listStatus(new Path(hdfsPath))) + val startTime = new Date().getTime + for (cnt <- 1 to fileArray.length) { + fs.copyToLocalFile(fileArray(cnt-1), new Path(localPath)) + } + } + + 
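  /**
   * Illustrative usage sketch added for this review (not part of the original patch); the
   * paths and column names below are placeholders:
   * {{{
   * new FeatureEncodingOrigin()
   *   .setMapLoadPath("/path/to/feature_map.json")
   *   .setDataPath("hdfs:///path/to/input")
   *   .setOutputFilePath("hdfs:///path/to/encoded")
   *   .setLocalSavePath("/tmp/encoder/")
   *   .setEncodeColumns("col1,col2")
   *   .setNumThread(40)
   *   .execute(inputDataFrame)
   * }}}
   */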
def execute(dataset: DataFrame = null): Unit = { + require(mapLoadPath.nonEmpty, "mapLoadPath is empty") + require(dataPath.nonEmpty, "dataPath is empty") + require(outputFilePath.nonEmpty, "outputFilePath is empty") + require(localSavePath.nonEmpty, "localSavePath is empty") + require(numThread > 0, "numThread is illegal") + val featureMap = parseJsonToIntMap(loadJsonToString(mapLoadPath)) + var res = dataset + for(feature <- encodeColumns) { + require(res.columns.contains(feature), "non existent encodeColumns: " + feature) + res = transform(res, feature, featureMap.toMap, feature) + } + res + .select(encodeColumns.map{t => col(t + "_index")}: _*) + .write.mode("overwrite") + .save(outputFilePath) + + copyFileToLocal(res.sparkSession, outputFilePath, localSavePath + "encode") + copyFileToLocal(res.sparkSession, dataPath, localSavePath + "data") + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/KNN.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/KNN.scala new file mode 100644 index 0000000..1907360 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/KNN.scala @@ -0,0 +1,588 @@ +package org.apache.spark.ml.knn + +import breeze.linalg.{DenseVector, Vector => BV} +import breeze.stats._ +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.classification.KNNClassificationModel +import org.apache.spark.ml.knn.KNN.{KNNPartitioner, RowWithVector, VectorWithNorm} +import org.apache.spark.ml.param._ +import org.apache.spark.ml.param.shared._ +import org.apache.spark.ml.regression.KNNRegressionModel +import org.apache.spark.ml.util._ +import org.apache.spark.ml.{Estimator, Model} +import org.apache.spark.ml.linalg.{Vector, VectorUDT, Vectors} +import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ +import org.apache.spark.rdd.{RDD, ShuffledRDD} +import org.apache.spark.sql.types._ +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.random.XORShiftRandom +import org.apache.spark.{HashPartitioner, Partitioner} +import org.apache.log4j +import org.apache.spark.mllib.knn.KNNUtils + +import scala.annotation.tailrec +import scala.collection.mutable.ArrayBuffer +import scala.util.hashing.byteswap64 + +// features column => vector, input columns => auxiliary columns to return by KNN model +private[ml] trait KNNModelParams extends Params with HasFeaturesCol with HasInputCols { + /** + * Param for the column name for returned neighbors. + * Default: "neighbors" + * + * @group param + */ + val neighborsCol = new Param[String](this, "neighborsCol", "column names for returned neighbors") + + /** @group getParam */ + def getNeighborsCol: String = $(neighborsCol) + + /** + * Param for distance column that will create a distance column of each nearest neighbor + * Default: no distance column will be used + * + * @group param + */ + val distanceCol = new Param[String](this, "distanceCol", "column that includes each neighbors' distance as an additional column") + + /** @group getParam */ + def getDistanceCol: String = $(distanceCol) + + /** + * Param for number of neighbors to find (> 0). 
+ * Default: 5 + * + * @group param + */ + val k = new IntParam(this, "k", "number of neighbors to find", ParamValidators.gt(0)) + + /** @group getParam */ + def getK: Int = $(k) + + /** + * Param for maximum distance to find neighbors + * Default: Double.PositiveInfinity + * + * @group param + */ + val maxDistance = new DoubleParam(this, "maxNeighbors", "maximum distance to find neighbors", // todo: maxDistance or maxNeighbors? + ParamValidators.gt(0)) + + /** @group getParam */ + def getMaxDistance: Double = $(maxDistance) + + /** + * Param for size of buffer used to construct spill trees and top-level tree search. + * Note the buffer size is 2 * tau as described in the paper. + * + * When buffer size is 0.0, the tree itself reverts to a metric tree. + * -1.0 triggers automatic effective nearest neighbor distance estimation. + * + * Default: -1.0 + * + * @group param + */ + val bufferSize = new DoubleParam(this, "bufferSize", + "size of buffer used to construct spill trees and top-level tree search", ParamValidators.gtEq(-1.0)) + + /** @group getParam */ + def getBufferSize: Double = $(bufferSize) + + private[ml] def transform(data: RDD[Vector], topTree: Broadcast[Tree], subTrees: RDD[Tree]): RDD[(Long, Array[(Row,Double)])] = { + val searchData = data.zipWithIndex() + .flatMap { + case (vector, index) => + val vectorWithNorm = new VectorWithNorm(vector) + val idx = KNN.searchIndices(vectorWithNorm, topTree.value, $(bufferSize)) + .map(i => (i, (vectorWithNorm, index))) + + assert(idx.nonEmpty, s"indices must be non-empty: $vector ($index)") + idx + } + .partitionBy(new HashPartitioner(subTrees.partitions.length)) + + // for each partition, search points within corresponding child tree + val results = searchData.zipPartitions(subTrees) { + (childData, trees) => + val tree = trees.next() + assert(!trees.hasNext) + childData.flatMap { + case (_, (point, i)) => + tree.query(point, $(k)).collect { + case (neighbor, distance) if distance <= $(maxDistance) => + (i, (neighbor.row, distance)) + } + } + } + + // merge results by point index together and keep topK results + results.topByKey($(k))(Ordering.by(-_._2)) + .map { case (i, seq) => (i, seq) } + } + + private[ml] def transform(dataset: Dataset[_], topTree: Broadcast[Tree], subTrees: RDD[Tree]): RDD[(Long, Array[(Row, Double)])] = { + transform(dataset.select($(featuresCol)).rdd.map(_.getAs[Vector](0)), topTree, subTrees) + } + +} + +private[ml] trait KNNParams extends KNNModelParams with HasSeed { + /** + * Param for number of points to sample for top-level tree (> 0). + * Default: 1000 + * + * @group param + */ + val topTreeSize = new IntParam(this, "topTreeSize", "number of points to sample for top-level tree", ParamValidators.gt(0)) + + /** @group getParam */ + def getTopTreeSize: Int = $(topTreeSize) + + /** + * Param for number of points at which to switch to brute-force for top-level tree (> 0). + * Default: 5 + * + * @group param + */ + val topTreeLeafSize = new IntParam(this, "topTreeLeafSize", + "number of points at which to switch to brute-force for top-level tree", ParamValidators.gt(0)) + + /** @group getParam */ + def getTopTreeLeafSize: Int = $(topTreeLeafSize) + + /** + * Param for number of points at which to switch to brute-force for distributed sub-trees (> 0). 
+ * Default: 20 + * + * @group param + */ + val subTreeLeafSize = new IntParam(this, "subTreeLeafSize", + "number of points at which to switch to brute-force for distributed sub-trees", ParamValidators.gt(0)) + + /** @group getParam */ + def getSubTreeLeafSize: Int = $(subTreeLeafSize) + + /** + * Param for number of sample sizes to take when estimating buffer size (at least two samples). + * Default: 100 to 1000 by 100 + * + * @group param + */ + val bufferSizeSampleSizes = new IntArrayParam(this, "bufferSizeSampleSize", // todo: should this have an 's' at the end? + "number of sample sizes to take when estimating buffer size", { arr: Array[Int] => arr.length > 1 && arr.forall(_ > 0) }) + + /** @group getParam */ + def getBufferSizeSampleSizes: Array[Int] = $(bufferSizeSampleSizes) + + /** + * Param for fraction of total points at which spill tree reverts back to metric tree + * if either child contains more points (0 <= rho <= 1). + * Default: 70% + * + * @group param + */ + val balanceThreshold = new DoubleParam(this, "balanceThreshold", + "fraction of total points at which spill tree reverts back to metric tree if either child contains more points", + ParamValidators.inRange(0, 1)) + + /** @group getParam */ + def getBalanceThreshold: Double = $(balanceThreshold) + + setDefault(topTreeSize -> 1000, topTreeLeafSize -> 10, subTreeLeafSize -> 30, + bufferSize -> -1.0, bufferSizeSampleSizes -> (100 to 1000 by 100).toArray, balanceThreshold -> 0.7, + k -> 5, neighborsCol -> "neighbors", distanceCol -> "", maxDistance -> Double.PositiveInfinity) + + /** + * Validates and transforms the input schema. + * + * @param schema input schema + * @return output schema + */ + protected def validateAndTransformSchema(schema: StructType): StructType = { + SchemaUtils.checkColumnType(schema, $(featuresCol), new VectorUDT) + val auxFeatures = $(inputCols).map(c => schema(c)) + val schemaWithNeighbors = SchemaUtils.appendColumn(schema, $(neighborsCol), ArrayType(StructType(auxFeatures))) + + if ($(distanceCol).isEmpty) { + schemaWithNeighbors + } else { + SchemaUtils.appendColumn(schemaWithNeighbors, $(distanceCol), ArrayType(DoubleType)) + } + } +} + +/** + * kNN Model facilitates k-Nestrest Neighbor search by storing distributed hybrid spill tree. + * Top level tree is a MetricTree but instead of using back tracking, it searches all possible leaves in parallel + * to avoid multiple iterations. It uses the same buffer size that is used in model training, when the search + * vector falls into the buffer zone of the node, it dispatches search to both children. + * + * A high level overview of the search phases is as follows: + * + * 1. For each vector to search, go through the top level tree to output a pair of (index, point) + * 1. Repartition search points by partition index + * 1. Search each point through the hybrid spill tree in that particular partition + * 1. For each point, merge results from different partitions and keep top k results. 
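 *
 * Minimal query sketch added for this review (not part of the original scaladoc; `model` and
 * `queryDF` stand for a fitted KNNModel and a DataFrame with a features column):
 * {{{
 * model.setK(10).setNeighborsCol("neighbors").setDistanceCol("distances").transform(queryDF)
 * }}}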
+ * + */ +class KNNModel private[ml]( + override val uid: String, + val topTree: Broadcast[Tree], + val subTrees: RDD[Tree] + ) extends Model[KNNModel] with KNNModelParams { + require(subTrees.getStorageLevel != StorageLevel.NONE, + "KNNModel is not designed to work with Trees that have not been cached") + + /** @group setParam */ + def setNeighborsCol(value: String): this.type = set(neighborsCol, value) + + /** @group setParam */ + def setDistanceCol(value: String): this.type = set(distanceCol, value) + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setMaxDistance(value: Double): this.type = set(maxDistance, value) + + /** @group setParam */ + def setBufferSize(value: Double): this.type = set(bufferSize, value) + + //TODO: All these can benefit from DataSet API + override def transform(dataset: Dataset[_]): DataFrame = { + val merged: RDD[(Long, Array[(Row,Double)])] = transform(dataset, topTree, subTrees) + + val withDistance = $(distanceCol).nonEmpty + + dataset.sqlContext.createDataFrame( + dataset.toDF().rdd.zipWithIndex().map { case (row, i) => (i, row) } + .leftOuterJoin(merged) + .map { + case (i, (row, neighborsAndDistances)) => + val (neighbors, distances) = neighborsAndDistances.map(_.unzip).getOrElse((Array.empty[Row], Array.empty[Double])) + if (withDistance) { + Row.fromSeq(row.toSeq :+ neighbors :+ distances) + } else { + Row.fromSeq(row.toSeq :+ neighbors) + } + }, + transformSchema(dataset.schema) + ) + } + + override def transformSchema(schema: StructType): StructType = { + val auxFeatures = $(inputCols).map(c => schema(c)) + val schemaWithNeighbors = SchemaUtils.appendColumn(schema, $(neighborsCol), ArrayType(StructType(auxFeatures))) + if ($(distanceCol).isEmpty) { + schemaWithNeighbors + } else { + SchemaUtils.appendColumn(schemaWithNeighbors, $(distanceCol), ArrayType(DoubleType)) + } + } + + override def copy(extra: ParamMap): KNNModel = { + val copied = new KNNModel(uid, topTree, subTrees) + copyValues(copied, extra).setParent(parent) + } + + def toNewClassificationModel(uid: String, numClasses: Int): KNNClassificationModel = { + copyValues(new KNNClassificationModel(uid, topTree, subTrees, numClasses)) + } + + def toNewRegressionModel(uid: String): KNNRegressionModel = { + copyValues(new KNNRegressionModel(uid, topTree, subTrees)) + } +} + +/** + * k-Nearest Neighbors (kNN) algorithm + * + * kNN finds k closest observations in training dataset. It can be used for both classification and regression. + * Furthermore it can also be used for other purposes such as input to clustering algorithm. + * + * While the brute-force approach requires no pre-training, each prediction requires going through the entire training + * set resulting O(n log(k)) runtime per individual prediction using a heap keep track of neighbor candidates. + * Many different implementations have been proposed such as Locality Sensitive Hashing (LSH), KD-Tree, Metric Tree and etc. + * Each algorithm has its shortcomings that prevent them to be effective on large-scale and/or high-dimensional dataset. + * + * This is an implementation of kNN based upon distributed Hybrid Spill-Trees where training points are organized into + * distributed binary trees. The algorithm is designed to support accurate approximate kNN search but by tuning parameters + * an exact search can also be performed with cost of additional runtime. 
+ * + * Each binary tree node is either a + * + * '''Metric Node''': + * Metric Node partition points exclusively into two children by finding two pivot points and divide by middle plane. + * When searched, the child whose pivot is closer to query vector is searched first. Back tracking is required to + * ensure accuracy in this case, where the other child should be searched if it can possibly contain better neighbor + * based upon candidates picked during previous search. + * + * '''Spill Node''': + * Spill Node also partitions points into two children however there are an overlapping buffer between the two pivot + * points. The larger the buffer size, the less effective the node eliminates points thus could increase tree height. + * When searched, defeatist search is used where only one child is searched and no back tracking happens in this + * process. Because of the buffer between two children, we are likely to end up with good enough candidates without + * searching the other part of the tree. + * + * While Spill Node promises O(h) runtime where h is the tree height, the tree is deeper than Metric Tree's O(log n) + * height on average. Furthermore, when it comes down to leaves where points are more closer to each other, the static + * buffer size means more points will end up in the buffer. Therefore a Balance Threshold (rho) is introduced: when + * either child of Spill Node makes up more than rho fraction of the total points at this level, Spill Node is reverted + * back to a Metric Node. + * + * A high level overview of the algorithm is as follows: + * + * 1. Sample M data points (M is relatively small and can be held in driver) + * 1. Build the top level metric tree + * 1. Repartition RDD by assigning each point to leaf node of the above tree + * 1. Build a hybrid spill tree at each partition + * + * This concludes the training phase of kNN. + * See [[KNNModel]] for details on prediction phase. + * + * + * This algorithm is described in [[http://dx.doi.org/10.1109/WACV.2007.18]] where it was shown to scale well in terms of + * number of observations and dimensions, bounded by the available memory across clusters (billions in paper's example). + * This implementation adapts the MapReduce algorithm to work with Spark. 
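 *
 * Minimal training sketch added for this review (not part of the original scaladoc; `trainDF`
 * stands for a DataFrame with "features" and "label" columns):
 * {{{
 * val knn = new KNN()
 *   .setFeaturesCol("features")
 *   .setAuxCols(Array("label"))
 *   .setTopTreeSize(1000)
 *   .setK(5)
 * val model = knn.fit(trainDF)
 * }}}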
+ * + */ +class KNN(override val uid: String) extends Estimator[KNNModel] with KNNParams { + def this() = this(Identifiable.randomUID("knn")) + + /** @group setParam */ + def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setAuxCols(value: Array[String]): this.type = set(inputCols, value) + + /** @group setParam */ + def setTopTreeSize(value: Int): this.type = set(topTreeSize, value) + + /** @group setParam */ + def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value) + + /** @group setParam */ + def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value) + + /** @group setParam */ + def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value) + + /** @group setParam */ + def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value) + + /** @group setParam */ + def setSeed(value: Long): this.type = set(seed, value) + + override def fit(dataset: Dataset[_]): KNNModel = { + val rand = new XORShiftRandom($(seed)) + //prepare data for model estimation + val data = dataset.selectExpr($(featuresCol), $(inputCols).mkString("struct(", ",", ")")) + .rdd + .map(row => new RowWithVector(row.getAs[Vector](0), row.getStruct(1))) + //sample data to build top-level tree + val sampled = data.sample(withReplacement = false, $(topTreeSize).toDouble / dataset.count(), rand.nextLong()).collect() + val topTree = MetricTree.build(sampled, $(topTreeLeafSize), rand.nextLong()) + //build partitioner using top-level tree + val part = new KNNPartitioner(topTree) + //noinspection ScalaStyle + val repartitioned = new ShuffledRDD[RowWithVector, Null, Null](data.map(v => (v, null)), part).keys + + val tau = + if ($(balanceThreshold) > 0 && $(bufferSize) < 0) { + KNN.estimateTau(data, $(bufferSizeSampleSizes), rand.nextLong()) + } else { + math.max(0, $(bufferSize)) + } + logInfo("Tau is: " + tau) + + val trees = repartitioned.mapPartitionsWithIndex { + (partitionId, itr) => + val rand = new XORShiftRandom(byteswap64($(seed) ^ partitionId)) + val childTree = + HybridTree.build(itr.toIndexedSeq, $(subTreeLeafSize), tau, $(balanceThreshold), rand.nextLong()) + + Iterator(childTree) + }.persist(StorageLevel.MEMORY_AND_DISK) + // TODO: force persisting trees primarily for benchmark. any reason not to do this for regular runs? 
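    // Review note added here (not in the original source): count() is an action, so it forces
    // the mapPartitionsWithIndex job above to run and materializes the persisted per-partition
    // hybrid spill trees before the model is handed back to the caller.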
+ trees.count() + + val model = new KNNModel(uid, trees.context.broadcast(topTree), trees).setParent(this) + copyValues(model).setBufferSize(tau) + } + + override def transformSchema(schema: StructType): StructType = { + validateAndTransformSchema(schema) + } + + override def copy(extra: ParamMap): KNN = defaultCopy(extra) +} + + +object KNN { + + val logger = log4j.Logger.getLogger(classOf[KNN]) + + /** + * VectorWithNorm can use more efficient algorithm to calculate distance + */ + case class VectorWithNorm(vector: Vector, norm: Double) { + def this(vector: Vector) = this(vector, Vectors.norm(vector, 2)) + + def this(vector: BV[Double]) = this(Vectors.fromBreeze(vector)) + + def fastSquaredDistance(v: VectorWithNorm): Double = { + KNNUtils.fastSquaredDistance(vector, norm, v.vector, v.norm) + } + + def fastDistance(v: VectorWithNorm): Double = math.sqrt(fastSquaredDistance(v)) + } + + /** + * VectorWithNorm plus auxiliary row information + */ + case class RowWithVector(vector: VectorWithNorm, row: Row) { + def this(vector: Vector, row: Row) = this(new VectorWithNorm(vector), row) + } + + /** + * Estimate a suitable buffer size based on dataset + * + * A suitable buffer size is the minimum size such that nearest neighbors can be accurately found even at + * boundary of splitting plane between pivot points. Therefore assuming points are uniformly distributed in + * high dimensional space, it should be approximately the average distance between points. + * + * Specifically the number of points within a certain radius of a given point is proportionally to the density of + * points raised to the effective number of dimensions, of which manifold data points exist on: + * R_s = \frac{c}{N_s ** 1/d} + * where R_s is the radius, N_s is the number of points, d is effective number of dimension, and c is a constant. + * + * To estimate R_s_all for entire dataset, we can take samples of the dataset of different size N_s to compute R_s. + * We can estimate c and d using linear regression. Lastly we can calculate R_s_all using total number of observation + * in dataset. + * + */ + def estimateTau(data: RDD[RowWithVector], sampleSize: Array[Int], seed: Long): Double = { + val total = data.count() + + // take samples of points for estimation + val samples = data.mapPartitionsWithIndex { + case (partitionId, itr) => + val rand = new XORShiftRandom(byteswap64(seed ^ partitionId)) + itr.flatMap { + p => sampleSize.zipWithIndex + .filter { case (size, _) => rand.nextDouble() * total < size } + .map { case (size, index) => (index, p) } + } + } + // compute N_s and R_s pairs + val estimators = samples + .groupByKey() + .map { + case (index, points) => (points.size, computeAverageDistance(points)) + }.collect().distinct + + // collect x and y vectors + val x = DenseVector(estimators.map { case (n, _) => math.log(n) }) + val y = DenseVector(estimators.map { case (_, d) => math.log(d) }) + + // estimate log(R_s) = alpha + beta * log(N_s) + val xMeanVariance = meanAndVariance(x) + val xmean = xMeanVariance.mean + val yMeanVariance = meanAndVariance(y) + val ymean = yMeanVariance.mean + + val corr = (mean(x *:* y) - xmean * ymean) / math.sqrt((mean(x *:* x) - xmean * xmean) * (mean(y *:* y) - ymean * ymean)) + + val beta = corr * yMeanVariance.stdDev / xMeanVariance.stdDev + val alpha = ymean - beta * xmean + val rs = math.exp(alpha + beta * math.log(total)) + + if (beta > 0 || beta.isNaN || rs.isNaN) { + val yMax = breeze.linalg.max(y) + logger.error( + s"""Unable to estimate Tau with positive beta: $beta. 
This maybe because data is too small. + |Setting to $yMax which is the maximum average distance we found in the sample. + |This may leads to poor accuracy. Consider manually set bufferSize instead. + |You can also try setting balanceThreshold to zero so only metric trees are built.""".stripMargin) + yMax + } else { + // c = alpha, d = - 1 / beta + rs / math.sqrt(-1 / beta) + } + } + + // compute the average distance of nearest neighbors within points using brute-force + private[this] def computeAverageDistance(points: Iterable[RowWithVector]): Double = { + val distances = points.map { + point => points.map(p => p.vector.fastSquaredDistance(point.vector)).filter(_ > 0).min + }.map(math.sqrt) + + distances.sum / distances.size + } + + /** + * Search leaf index used by KNNPartitioner to partition training points + * + * @param v one training point to partition + * @param tree top tree constructed using sampled points + * @param acc accumulator used to help determining leaf index + * @return leaf/partition index + */ + @tailrec + private[knn] def searchIndex(v: RowWithVector, tree: Tree, acc: Int = 0): Int = { + tree match { + case node: MetricTree => + val leftDistance = node.leftPivot.fastSquaredDistance(v.vector) + val rightDistance = node.rightPivot.fastSquaredDistance(v.vector) + if (leftDistance < rightDistance) { + searchIndex(v, node.leftChild, acc) + } else { + searchIndex(v, node.rightChild, acc + node.leftChild.leafCount) + } + case _ => acc // reached leaf + } + } + + //TODO: Might want to make this tail recursive + private[ml] def searchIndices(v: VectorWithNorm, tree: Tree, tau: Double, acc: Int = 0): Seq[Int] = { + tree match { + case node: MetricTree => + val leftDistance = node.leftPivot.fastDistance(v) + val rightDistance = node.rightPivot.fastDistance(v) + + val buffer = new ArrayBuffer[Int] + if (leftDistance - rightDistance <= tau) { + buffer ++= searchIndices(v, node.leftChild, tau, acc) + } + + if (rightDistance - leftDistance <= tau) { + buffer ++= searchIndices(v, node.rightChild, tau, acc + node.leftChild.leafCount) + } + + buffer + case _ => Seq(acc) // reached leaf + } + } + + /** + * Partitioner used to map vector to leaf node which determines the partition it goes to + * + * @param tree `Tree` used to find leaf + */ + class KNNPartitioner[T <: RowWithVector](tree: Tree) extends Partitioner { + override def numPartitions: Int = tree.leafCount + + override def getPartition(key: Any): Int = { + key match { + case v: RowWithVector => searchIndex(v, tree) + case _ => throw new IllegalArgumentException(s"Key must be of type Vector but got: $key") + } + } + + } + +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/MetricTree.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/MetricTree.scala new file mode 100644 index 0000000..b7e3bd2 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/knn/MetricTree.scala @@ -0,0 +1,397 @@ +package org.apache.spark.ml.knn + +import breeze.linalg._ +import org.apache.spark.ml.knn.KNN._ +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.util.random.XORShiftRandom + +import scala.collection.mutable + +/** + * A [[Tree]] is used to store data points used in k-NN search. It represents + * a binary tree node. It keeps track of the pivot vector which closely approximate + * the center of all vectors within the node. All vectors are within the radius of + * distance to the pivot vector. Finally it knows the number of leaves to help + * determining partition index. 
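 *
 * Query sketch added for this review (not part of the original scaladoc; `tree` stands for any
 * built subtree such as a [[MetricTree]]):
 * {{{
 * val neighbors: Iterable[(RowWithVector, Double)] = tree.query(Vectors.dense(1.0, 2.0), k = 5)
 * }}}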
+ */ +private[ml] abstract class Tree extends Serializable { + val leftChild: Tree + val rightChild: Tree + val size: Int + val leafCount: Int + val pivot: VectorWithNorm + val radius: Double + + def iterator: Iterator[RowWithVector] + + /** + * k-NN query using pre-built [[Tree]] + * @param v vector to query + * @param k number of nearest neighbor + * @return a list of neighbor that is nearest to the query vector + */ + def query(v: Vector, k: Int = 1): Iterable[(RowWithVector, Double)] = query(new VectorWithNorm(v), k) + def query(v: VectorWithNorm, k: Int): Iterable[(RowWithVector, Double)] = query(new KNNCandidates(v, k)).toIterable + + /** + * Refine k-NN candidates using data in this [[Tree]] + */ + private[knn] def query(candidates: KNNCandidates): KNNCandidates + + /** + * Compute QueryCost defined as || v.center - q || - r + * when >= v.r node can be pruned + * for MetricNode this can be used to determine which child does queryVector falls into + */ + private[knn] def distance(candidates: KNNCandidates): Double = distance(candidates.queryVector) + + private[knn] def distance(v: VectorWithNorm): Double = + if(pivot.vector.size > 0) pivot.fastDistance(v) else 0.0 +} + +private[knn] +case object Empty extends Tree { + override val leftChild = this + override val rightChild = this + override val size = 0 + override val leafCount = 0 + override val pivot = new VectorWithNorm(Vectors.dense(Array.empty[Double])) + override val radius = 0.0 + + override def iterator: Iterator[RowWithVector] = Iterator.empty + override def query(candidates: KNNCandidates): KNNCandidates = candidates +} + +private[knn] +case class Leaf (data: IndexedSeq[RowWithVector], + pivot: VectorWithNorm, + radius: Double) extends Tree { + override val leftChild = Empty + override val rightChild = Empty + override val size = data.size + override val leafCount = 1 + + override def iterator: Iterator[RowWithVector] = data.iterator + + // brute force k-NN search at the leaf + override def query(candidates: KNNCandidates): KNNCandidates = { + val sorted = data + .map{ v => (v, candidates.queryVector.fastDistance(v.vector)) } + .sortBy(_._2) + + for((v, d) <- sorted if candidates.notFull || d < candidates.maxDistance) + candidates.insert(v, d) + + candidates + } +} + +private[knn] +object Leaf { + def apply(data: IndexedSeq[RowWithVector]): Leaf = { + val vectors = data.map(_.vector.vector.asBreeze) + val (minV, maxV) = vectors.foldLeft((vectors.head, vectors.head)) { + case ((accMin, accMax), bv) => + (min(accMin, bv), max(accMax, bv)) + } + val pivot = new VectorWithNorm((minV + maxV) / 2.0) + val radius = math.sqrt(squaredDistance(minV, maxV)) / 2.0 + Leaf(data, pivot, radius) + } +} + +/** + * A [[MetricTree]] represents a MetricNode where data are split into two partitions: left and right. + * There exists two pivot vectors: leftPivot and rightPivot to determine the partitioning. + * Pivot vector should be the middle of leftPivot and rightPivot vectors. + * Points that is closer to leftPivot than to rightPivot belongs to leftChild and rightChild otherwise. + * + * During search, because we have information about each child's pivot and radius, we can see if the + * hyper-sphere intersects with current candidates sphere. If so, we search the child that has the + * most potential (i.e. the child which has the closest pivot). + * Once that child has been fully searched, we backtrack to the remaining child and search if necessary. + * + * This is much more efficient than naive brute force search. 
However backtracking can take a lot of time + * when the number of dimension is high (due to longer time to compute distance and the volume growing much + * faster than radius). + */ +private[knn] +case class MetricTree(leftChild: Tree, + leftPivot: VectorWithNorm, + rightChild: Tree, + rightPivot: VectorWithNorm, + pivot: VectorWithNorm, + radius: Double + ) extends Tree { + override val size = leftChild.size + rightChild.size + override val leafCount = leftChild.leafCount + rightChild.leafCount + + override def iterator: Iterator[RowWithVector] = leftChild.iterator ++ rightChild.iterator + override def query(candidates: KNNCandidates): KNNCandidates = { + lazy val leftQueryCost = leftChild.distance(candidates) + lazy val rightQueryCost = rightChild.distance(candidates) + // only query if at least one of the children is worth looking + if(candidates.notFull || + leftQueryCost - candidates.maxDistance < leftChild.radius || + rightQueryCost - candidates.maxDistance < rightChild.radius ){ + val remainingChild = { + if (leftQueryCost <= rightQueryCost) { + leftChild.query(candidates) + rightChild + } else { + rightChild.query(candidates) + leftChild + } + } + // check again to see if the remaining child is still worth looking + if (candidates.notFull || + remainingChild.distance(candidates) - candidates.maxDistance < remainingChild.radius) { + remainingChild.query(candidates) + } + } + candidates + } +} + +object MetricTree { + /** + * Build a (metric)[[Tree]] that facilitate k-NN query + * + * @param data vectors that contain all training data + * @param seed random number generator seed used in pivot point selecting + * @return a [[Tree]] can be used to do k-NN query + */ + def build(data: IndexedSeq[RowWithVector], leafSize: Int = 1, seed: Long = 0L): Tree = { + val size = data.size + if(size == 0) { + Empty + } else if(size <= leafSize) { + Leaf(data) + } else { + val rand = new XORShiftRandom(seed) + val randomPivot = data(rand.nextInt(size)).vector + val leftPivot = data.maxBy(v => randomPivot.fastSquaredDistance(v.vector)).vector + if(leftPivot == randomPivot) { + // all points are identical (or only one point left) + Leaf(data, randomPivot, 0.0) + } else { + val rightPivot = data.maxBy(v => leftPivot.fastSquaredDistance(v.vector)).vector + val pivot = new VectorWithNorm(Vectors.fromBreeze((leftPivot.vector.asBreeze + rightPivot.vector.asBreeze) / 2.0)) + val radius = math.sqrt(data.map(v => pivot.fastSquaredDistance(v.vector)).max) + val (leftPartition, rightPartition) = data.partition{ + v => leftPivot.fastSquaredDistance(v.vector) < rightPivot.fastSquaredDistance(v.vector) + } + + MetricTree( + build(leftPartition, leafSize, rand.nextLong()), + leftPivot, + build(rightPartition, leafSize, rand.nextLong()), + rightPivot, + pivot, + radius + ) + } + } + } +} + +/** + * A [[SpillTree]] represents a SpillNode. Just like [[MetricTree]], it splits data into two partitions. + * However, instead of partition data into exactly two halves, it contains a buffer zone with size of tau. + * Left child contains all data left to the center plane + tau (in the leftPivot -> rightPivot direction). + * Right child contains all data right to the center plane - tau. + * + * Search doesn't do backtracking but rather adopt a defeatist search where it search the most prominent + * child and that child only. The buffer ensures such strategy doesn't result in a poor outcome. 
+ */ +private[knn] +case class SpillTree(leftChild: Tree, + leftPivot: VectorWithNorm, + rightChild: Tree, + rightPivot: VectorWithNorm, + pivot: VectorWithNorm, + radius: Double, + tau: Double, + bufferSize: Int + ) extends Tree { + override val size = leftChild.size + rightChild.size - bufferSize + override val leafCount = leftChild.leafCount + rightChild.leafCount + + override def iterator: Iterator[RowWithVector] = + leftChild.iterator ++ rightChild.iterator.filter(childFilter(leftPivot, rightPivot)) + + override def query(candidates: KNNCandidates): KNNCandidates = { + if (size <= candidates.k - candidates.candidates.size) { + iterator.foreach(candidates.insert) + } else { + val leftQueryCost = candidates.queryVector.fastSquaredDistance(leftPivot) + val rightQueryCost = candidates.queryVector.fastSquaredDistance(rightPivot) + + (if (leftQueryCost <= rightQueryCost) leftChild else rightChild).query(candidates) + + // fill candidates with points from other child excluding buffer so we don't double count. + // depending on K and how high we are in the tree, this can be very expensive and undesirable + // TODO: revisit this idea when we do large scale testing + if(candidates.notFull) { + (if (leftQueryCost <= rightQueryCost) { + rightChild.iterator.filter(childFilter(leftPivot, rightPivot)) + } else { + leftChild.iterator.filter(childFilter(rightPivot, leftPivot)) + }).foreach(candidates.tryInsert) + } + } + candidates + } + + private[this] val childFilter: (VectorWithNorm, VectorWithNorm) => RowWithVector => Boolean = + (p1, p2) => p => p.vector.fastDistance(p1) - p.vector.fastDistance(p2) > tau +} + + +object SpillTree { + /** + * Build a (spill)[[Tree]] that facilitate k-NN query + * + * @param data vectors that contain all training data + * @param tau overlapping size + * @param seed random number generators seed used in pivot point selecting + * @return a [[Tree]] can be used to do k-NN query + */ + def build(data: IndexedSeq[RowWithVector], leafSize: Int = 1, tau: Double, seed: Long = 0L): Tree = { + val size = data.size + if (size == 0) { + Empty + } else if (size <= leafSize) { + Leaf(data) + } else { + val rand = new XORShiftRandom(seed) + val randomPivot = data(rand.nextInt(size)).vector + val leftPivot = data.maxBy(v => randomPivot.fastSquaredDistance(v.vector)).vector + if (leftPivot == randomPivot) { + // all points are identical (or only one point left) + Leaf(data, randomPivot, 0.0) + } else { + val rightPivot = data.maxBy(v => leftPivot.fastSquaredDistance(v.vector)).vector + val pivot = new VectorWithNorm(Vectors.fromBreeze((leftPivot.vector.asBreeze + rightPivot.vector.asBreeze) / 2.0)) + val radius = math.sqrt(data.map(v => pivot.fastSquaredDistance(v.vector)).max) + val dataWithDistance = data.map(v => + (v, leftPivot.fastDistance(v.vector), rightPivot.fastDistance(v.vector)) + ) + val leftPartition = dataWithDistance.filter { case (_, left, right) => left - right <= tau }.map(_._1) + val rightPartition = dataWithDistance.filter { case (_, left, right) => right - left <= tau }.map(_._1) + + SpillTree( + build(leftPartition, leafSize, tau, rand.nextLong()), + leftPivot, + build(rightPartition, leafSize, tau, rand.nextLong()), + rightPivot, + pivot, + radius, + tau, + leftPartition.size + rightPartition.size - size + ) + } + } + } +} + +object HybridTree { + /** + * Build a (hybrid-spill) `Tree` that facilitate k-NN query + * + * @param data vectors that contain all training data + * @param seed random number generator seed used in pivot point selecting + * @param tau 
overlapping size + * @param rho balance threshold + * @return a `Tree` can be used to do k-NN query + */ + //noinspection ScalaStyle + def build(data: IndexedSeq[RowWithVector], + leafSize: Int = 1, + tau: Double, + rho: Double = 0.7, + seed: Long = 0L): Tree = { + val size = data.size + if (size == 0) { + Empty + } else if (size <= leafSize) { + Leaf(data) + } else { + val rand = new XORShiftRandom(seed) + val randomPivot = data(rand.nextInt(size)).vector + val leftPivot = data.maxBy(v => randomPivot.fastSquaredDistance(v.vector)).vector + if (leftPivot == randomPivot) { + // all points are identical (or only one point left) + Leaf(data, randomPivot, 0.0) + } else { + val rightPivot = data.maxBy(v => leftPivot.fastSquaredDistance(v.vector)).vector + val pivot = new VectorWithNorm(Vectors.fromBreeze((leftPivot.vector.asBreeze + rightPivot.vector.asBreeze) / 2.0)) + val radius = math.sqrt(data.map(v => pivot.fastSquaredDistance(v.vector)).max) + lazy val dataWithDistance = data.map(v => + (v, leftPivot.fastDistance(v.vector), rightPivot.fastDistance(v.vector)) + ) + // implemented boundary is parabola (rather than perpendicular plane described in the paper) + lazy val leftPartition = dataWithDistance.filter { case (_, left, right) => left - right <= tau }.map(_._1) + lazy val rightPartition = dataWithDistance.filter { case (_, left, right) => right - left <= tau }.map(_._1) + + if(rho <= 0.0 || leftPartition.size > size * rho || rightPartition.size > size * rho) { + //revert back to metric node + val (leftPartition, rightPartition) = data.partition{ + v => leftPivot.fastSquaredDistance(v.vector) < rightPivot.fastSquaredDistance(v.vector) + } + MetricTree( + build(leftPartition, leafSize, tau, rho, rand.nextLong()), + leftPivot, + build(rightPartition, leafSize, tau, rho, rand.nextLong()), + rightPivot, + pivot, + radius + ) + } else { + SpillTree( + build(leftPartition, leafSize, tau, rho, rand.nextLong()), + leftPivot, + build(rightPartition, leafSize, tau, rho, rand.nextLong()), + rightPivot, + pivot, + radius, + tau, + leftPartition.size + rightPartition.size - size + ) + } + } + } + } +} + +/** + * Structure to maintain search progress/results for a single query vector. + * Internally uses a PriorityQueue to maintain a max-heap to keep track of the + * next neighbor to evict. + * + * @param queryVector vector being searched + * @param k number of neighbors to return + */ +private[knn] +class KNNCandidates(val queryVector: VectorWithNorm, val k: Int) extends Serializable { + private[knn] val candidates = mutable.PriorityQueue.empty[(RowWithVector, Double)] { + Ordering.by(_._2) + } + + // return the current maximum distance from neighbor to search vector + def maxDistance: Double = if(candidates.isEmpty) 0.0 else candidates.head._2 + // insert evict neighbor if required. however it doesn't make sure the insert improves + // search results. 
+
+/**
+ * Structure to maintain search progress/results for a single query vector.
+ * Internally uses a PriorityQueue to maintain a max-heap to keep track of the
+ * next neighbor to evict.
+ *
+ * @param queryVector vector being searched
+ * @param k number of neighbors to return
+ */
+private[knn]
+class KNNCandidates(val queryVector: VectorWithNorm, val k: Int) extends Serializable {
+  private[knn] val candidates = mutable.PriorityQueue.empty[(RowWithVector, Double)] {
+    Ordering.by(_._2)
+  }
+
+  // Return the current maximum distance from a kept neighbor to the search vector.
+  def maxDistance: Double = if (candidates.isEmpty) 0.0 else candidates.head._2
+  // Insert the neighbor, evicting the current farthest one if required. This does not check
+  // that the insert improves the search results; it is the caller's responsibility to make sure
+  // that either the candidate list is not full or the inserted neighbor brings maxDistance down.
+  def insert(v: RowWithVector, d: Double): Unit = {
+    while (candidates.size >= k) candidates.dequeue()
+    candidates.enqueue((v, d))
+  }
+  def insert(v: RowWithVector): Unit = insert(v, v.vector.fastDistance(queryVector))
+  def tryInsert(v: RowWithVector): Unit = {
+    val distance = v.vector.fastDistance(queryVector)
+    if (notFull || distance < maxDistance) insert(v, distance)
+  }
+  def toIterable: Iterable[(RowWithVector, Double)] = candidates
+  def notFull: Boolean = candidates.size < k
+}
diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/recommendation/SimRankOpenSource.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/recommendation/SimRankOpenSource.scala
new file mode 100644
index 0000000..8d08e47
--- /dev/null
+++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/recommendation/SimRankOpenSource.scala
@@ -0,0 +1,151 @@
+// scalastyle:off
+package org.apache.spark.ml.recommendation
+
+import org.apache.spark.{Dependency, ShuffleDependency, SparkContext}
+import org.apache.spark.graphx._
+import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.types.{StringType, StructField, StructType}
+import org.apache.spark.sql.{DataFrame, Row}
+import org.apache.spark.storage.StorageLevel
+import org.slf4j.LoggerFactory
+
+/**
+ * SimRank similarity (open-source baseline implementation).
+ *
+ */
+class SimRankOpenSource() {
+
+  private val PARAM_NAME_DAMP = "damp"
+  private val INPUT_NAME_NODES_INFO = "nodes"
+  private val PARAM_NAME_MAX_ITERATOR = "maxIterator"
+
+  def execute(input: DataFrame, colNames: (String, String), damp: Double, maxIter: Int): (DataFrame, DataFrame) = {
+    import input.sparkSession.implicits._
+    val nodes = input.select(colNames._1, colNames._2).distinct().map(one => (one.getString(0), one.getString(1))).rdd.cache()
+    val sim = SimRankOpenSource.getSimilarity(nodes, damp, maxIter)
+
+    (sim._1.toDF(colNames._1 + "1", colNames._1 + "2", "similarity"), sim._2.toDF(colNames._2 + "1", colNames._2 + "2", "similarity"))
+  }
+
+}
+
+class SimRankGraphOpenSource() {
+  /**
+   * Build the graph structure.
+   *
+   * @param node2Index node-to-index mapping
+   * @param nodes      user-item bipartite edges
+   * @return the constructed graph
+   */
+  def graphStruct(node2Index: RDD[(String, Long)], nodes: RDD[(String, String)]): Graph[String, Int] = {
+
+    val indexedNodes = nodes.join(node2Index).map(r => (r._2._1, r._2._2)).join(node2Index).map(r => (r._2._1, r._2._2))
+
+    val relationShips: RDD[Edge[Int]] = indexedNodes.map { x =>
+      val x1 = x._1
+      val x2 = x._2
+      Edge(x1, x2, 1)
+    }
+    val users: RDD[(VertexId, String)] = node2Index.map { x =>
+      (x._2, x._1)
+    }
+
+    val graph = Graph(users, relationShips)
+    graph
+  }
+}
+
+object SimRankOpenSource {
+  private val LOGGER = LoggerFactory.getLogger(this.getClass)
+
+  /**
+   * Compute user-user and item-item similarities.
+   *
+   * @param nodes user-item pairs [device_id, goods_id]
+   * @param damp  damping factor
+   */
+  def getSimilarity(nodes: RDD[(String, String)],
+                    damp: Double, maxIte: Int): (RDD[(String, String, Double)], RDD[(String, String, Double)]) = {
+    val itemSet = nodes.map(x => (x._2, "-")).distinct()
+    val userSet = nodes.map(x => (x._1, "-")).distinct()
+    // Assign an integer index to every user and item
+    val node2IndexArray = (nodes.map(_._1) union nodes.map(_._2)).distinct.collect().zipWithIndex.map(one => (one._1, one._2.toLong))
+    val nodesNum = node2IndexArray.length
+    val node2Index = nodes.sparkContext.parallelize(node2IndexArray)
+
+    val graph = new SimRankGraphOpenSource().graphStruct(node2Index, nodes)
+    val outs = graph.outDegrees.map(x => (x._1, 1 / x._2.toDouble))
+    val ins = graph.inDegrees.map(x => (x._1, 1 / x._2.toDouble))
+    val rdd_out = graph.outerJoinVertices(outs)((id, _, degin) => (id.toString, degin.getOrElse(0))).triplets.map { x =>
+      (x.dstId, x.srcId, x.srcAttr._2.toString.toDouble * x.attr)
+    }
+    val rdd_int = graph.outerJoinVertices(ins)((id, _, degin) => (id.toString, degin.getOrElse(0))).triplets.map { x =>
+      (x.srcId, x.dstId, x.dstAttr._2.toString.toDouble * x.attr)
+    }
+
+    val rdd_all = rdd_out.union(rdd_int)
+
+    val blockSize = 2048
+    val transferMatrix = new CoordinateMatrix(rdd_all.map { x =>
+      MatrixEntry(x._1, x._2, x._3)
+    }.repartition(1000)).toBlockMatrix(blockSize, blockSize).persist(StorageLevel.MEMORY_AND_DISK_SER)
+
+    // Initialize the similarity matrix as the identity
+    var S_k = new CoordinateMatrix(nodes.sparkContext.parallelize(0 until nodesNum).map { x =>
+      MatrixEntry(x, x, 1.0)
+    }.repartition(1000))
+    // Similarity matrix entries of iteration k+1
+    var S_kp1 = S_k.entries
+
+    for (i <- 0 until maxIte) {
+      S_kp1 = transferMatrix.transpose.multiply(S_k.toBlockMatrix(blockSize, blockSize))
+        .multiply(transferMatrix).toCoordinateMatrix().entries.map(entry => {
+        if (entry.i == entry.j) {
+          MatrixEntry(entry.i, entry.j, 1.0)
+        } else {
+          MatrixEntry(entry.i, entry.j, entry.value * damp)
+        }
+      }).persist(StorageLevel.MEMORY_AND_DISK_SER).setName("S_Kp1")
+      S_kp1.foreachPartition(_ => {})
+      cleanShuffleDependencies(S_kp1.sparkContext, S_kp1.dependencies)
+      S_k = new CoordinateMatrix(S_kp1, nodesNum, nodesNum)
+    }
+
+    // RDD[(idx, device_id/goods_id)]
+    val index2Node = node2Index.map(x => (x._2, x._1))
+
+    val nodeSim = S_kp1.map(x => (x.i, x.j, x.value)).map(x => (x._1, (x._2, x._3)))
+      .join(index2Node)
+      .map(x => (x._2._1._1, (x._2._1._2, x._2._2)))
+      .join(index2Node)
+      .map(x => (x._2._1._2, (x._2._2, x._2._1._1)))
+    val userSim = nodeSim.join(userSet).map(x => (x._1, x._2._1._1, x._2._1._2)).filter(x => !x._1.equals(x._2))
+    val itemSim = nodeSim.join(itemSet).map(x => (x._1, x._2._1._1, x._2._1._2)).filter(x => !x._1.equals(x._2))
+    (userSim, itemSim)
+  }
+
+  def cleanShuffleDependencies[T](
+      sc: SparkContext,
+      deps: Seq[Dependency[_]],
+      blocking: Boolean = false): Unit = {
+    // If there is no reference tracking we skip clean up.
+    sc.cleaner.foreach { cleaner =>
+      /**
+       * Clean the shuffles & all of its parents.
+ */ + def cleanEagerly(dep: Dependency[_]): Unit = { + if (dep.isInstanceOf[ShuffleDependency[_, _, _]]) { + val shuffleId = dep.asInstanceOf[ShuffleDependency[_, _, _]].shuffleId + cleaner.doCleanupShuffle(shuffleId, blocking) + } + val rdd = dep.rdd + val rddDeps = rdd.dependencies + if (rdd.getStorageLevel == StorageLevel.NONE && rddDeps != null) { + rddDeps.foreach(cleanEagerly) + } + } + deps.foreach(cleanEagerly) + } + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/regression/KNNRegression.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/regression/KNNRegression.scala new file mode 100644 index 0000000..a5f397a --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/regression/KNNRegression.scala @@ -0,0 +1,156 @@ +package org.apache.spark.ml.regression + +import org.apache.spark.broadcast.Broadcast +import org.apache.spark.ml.knn.{KNN, KNNModelParams, KNNParams, Tree} +import org.apache.spark.ml.param.ParamMap +import org.apache.spark.ml.param.shared.HasWeightCol +import org.apache.spark.ml.util.Identifiable +import org.apache.spark.ml.{PredictionModel, Predictor} +import org.apache.spark.ml.linalg.Vector +import org.apache.spark.rdd.RDD +import org.apache.spark.sql.{DataFrame, Dataset, Row} +import org.apache.spark.storage.StorageLevel + +/** + * [[https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm]] for regression. + * The output value is simply the average of the values of its k nearest neighbors. + */ +class KNNRegression(override val uid: String) extends Predictor[Vector, KNNRegression, KNNRegressionModel] +with KNNParams with HasWeightCol { + def this() = this(Identifiable.randomUID("knnr")) + + /** @group setParam */ + override def setFeaturesCol(value: String): this.type = set(featuresCol, value) + + /** @group setParam */ + override def setLabelCol(value: String): this.type = { + set(labelCol, value) + + if ($(weightCol).isEmpty) { + set(inputCols, Array(value)) + } else { + set(inputCols, Array(value, $(weightCol))) + } + } + + //fill in default label col + setDefault(inputCols, Array($(labelCol))) + + /** @group setWeight */ + def setWeightCol(value: String): this.type = { + set(weightCol, value) + + if (value.isEmpty) { + set(inputCols, Array($(labelCol))) + } else { + set(inputCols, Array($(labelCol), value)) + } + } + + setDefault(weightCol -> "") + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setTopTreeSize(value: Int): this.type = set(topTreeSize, value) + + /** @group setParam */ + def setTopTreeLeafSize(value: Int): this.type = set(topTreeLeafSize, value) + + /** @group setParam */ + def setSubTreeLeafSize(value: Int): this.type = set(subTreeLeafSize, value) + + /** @group setParam */ + def setBufferSizeSampleSizes(value: Array[Int]): this.type = set(bufferSizeSampleSizes, value) + + /** @group setParam */ + def setBalanceThreshold(value: Double): this.type = set(balanceThreshold, value) + + /** @group setParam */ + def setSeed(value: Long): this.type = set(seed, value) + + override protected def train(dataset: Dataset[_]): KNNRegressionModel = { + val knnModel = copyValues(new KNN()).fit(dataset) + knnModel.toNewRegressionModel(uid) + } + + override def fit(dataset: Dataset[_]): KNNRegressionModel = { + // Need to overwrite this method because we need to manually overwrite the buffer size + // because it is not supposed to stay the same as the Regressor if user sets it to -1. 
+ transformSchema(dataset.schema, logging = true) + val model = train(dataset) + val bufferSize = model.getBufferSize + copyValues(model.setParent(this)).setBufferSize(bufferSize) + } + + override def copy(extra: ParamMap): KNNRegression = defaultCopy(extra) +} + +class KNNRegressionModel private[ml]( + override val uid: String, + val topTree: Broadcast[Tree], + val subTrees: RDD[Tree] + ) extends PredictionModel[Vector, KNNRegressionModel] +with KNNModelParams with HasWeightCol with Serializable { + require(subTrees.getStorageLevel != StorageLevel.NONE, + "KNNModel is not designed to work with Trees that have not been cached") + + /** @group setParam */ + def setK(value: Int): this.type = set(k, value) + + /** @group setParam */ + def setBufferSize(value: Double): this.type = set(bufferSize, value) + + //TODO: This can benefit from DataSet API in Spark 1.6 + override def transformImpl(dataset: Dataset[_]): DataFrame = { + val getWeight: Row => Double = { + if($(weightCol).isEmpty) { + r => 1.0 + } else { + r => r.getDouble(1) + } + } + + val neighborDataset : RDD[(Long, Array[(Row, Double)])] = transform(dataset, topTree, subTrees) + val merged = neighborDataset + .map { + case (id, labelsDists) => + val (labels, _) = labelsDists.unzip + var i = 0 + var weight = 0.0 + var sum = 0.0 + val length = labels.length + while (i < length) { + val row = labels(i) + val w = getWeight(row) + sum += row.getDouble(0) * w + weight += w + i += 1 + } + + (id, sum / weight) + } + + dataset.sqlContext.createDataFrame( + dataset.toDF().rdd.zipWithIndex().map { case (row, i) => (i, row) } + .leftOuterJoin(merged) //make sure we don't lose any observations + .map { + case (i, (row, value)) => Row.fromSeq(row.toSeq :+ value.get) + }, + transformSchema(dataset.schema) + ) + } + + override def copy(extra: ParamMap): KNNRegressionModel = { + val copied = new KNNRegressionModel(uid, topTree, subTrees) + copyValues(copied, extra).setParent(parent) + } + + override def predict(features: Vector): Double = { + val neighborDataset : RDD[(Long, Array[(Row, Double)])] = transform(subTrees.context.parallelize(Seq(features)), topTree, subTrees) + val results = neighborDataset.first()._2 + val labels = results.map(_._1.getDouble(0)) + labels.sum / labels.length + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/BaseRange.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/BaseRange.scala new file mode 100644 index 0000000..76ef964 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/BaseRange.scala @@ -0,0 +1,77 @@ +// scalastyle:off header.matches +/* + * This file to You under the Apache License, Version 2.0; + * you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +package org.apache.spark.ml.tuning + +import scala.util.Random + +/** + * Abstract base class for IntervalRange, ContinueRange and DiscreteRange. + */ +abstract class BaseRange() { + val rd = new Random() + + /** + * Sample a value from a range of values. + * @return a value. + */ + def sampleOne(): Double +} + +/** Create a new range with the `start`, `end` and `step` values of this range. 
+ * + * @param start the start value + * @param end the end value + * @param step the step value + */ +final case class IntervalRange(start: Double, end: Double, step: Double) extends BaseRange { + require(end != start, s"Upper boundary $end must not be equal to boundary $start") + require(step != 0.0, s"Step must not be equal to 0") + + val paramValues: List[Double] = (BigDecimal(start) to end by step).map(_.toDouble).toList + + /** + * Sample a value from a range of values. + * @return a value. + */ + override def sampleOne(): Double = { + paramValues(rd.nextInt(paramValues.length)) + } +} + +/** Create a new range with the `lower` and `upper` values of this range. + * + * @param lower the start value + * @param upper the end value + */ +final case class ContinueRange(lower: Double, upper: Double) extends BaseRange { + require(upper > lower, s"Upper boundary $upper must be greater than lower boundary $lower") + + /** + * sample a value from a range of values. + * @return a value. + */ + override def sampleOne(): Double = { + lower + (upper - lower) * rd.nextDouble() + } +} + +/** Create a new range with the discrete values set. + * + * @param paramValues set of discrete values. + */ +final case class DiscreteRange(paramValues: Seq[Double]) extends BaseRange { + + /** + * sample a value from a range of values. + * @return a value. + */ + override def sampleOne(): Double = { + paramValues(rd.nextInt(paramValues.length)) + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamSpace.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamSpace.scala new file mode 100644 index 0000000..83c511b --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamSpace.scala @@ -0,0 +1,100 @@ +// scalastyle:off header.matches +/* + * This file to You under the Apache License, Version 2.0; + * you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +package org.apache.spark.ml.tuning + +import scala.collection.mutable.ArrayBuffer + +import org.apache.spark.ml.linalg.{Vector, Vectors} +import org.apache.spark.ml.param.{Param, ParamMap} + +/** + * Class of hyper-parameters space. + */ +class ParamSpace() { + var paramList: List[ParamType[_ <: AnyVal]] = List() + var paramNames: Array[Param[_ <: AnyVal]] = Array() + + /** + * Add IntType hyper-parameters. + * + * @param parent parent object. + * @param name param name. + * @param valueRange Range of parameter values. + */ + def addIntParam(parent: String, name: String, valueRange: BaseRange): Unit = { + val param = IntParamType(valueRange, parent, name) + paramList :+= param + paramNames :+= param.getParamName + } + + /** + * Add DoubleType hyper-parameters. + * + * @param parent parent object. + * @param name param name. + * @param valueRange Range of parameter values. + */ + def addDoubleParam(parent: String, name: String, valueRange: BaseRange): Unit = { + val param = DoubleParmType(valueRange, parent, name) + paramList :+= param + paramNames :+= param.getParamName + } + + private def asDouble(num: Any): Double = { + num match { + case i: Int => i.toDouble + case i: Long => i.toDouble + case i: Float => i.toDouble + case i: Double => i + case _ => throw new Exception(s"type ${num.getClass} is not supported") + } + } + + /** + * Get configuration values from paramMaps. + * + * @param configs param configurations. + * @return param value. 
+ */ + def getConfigsValue(configs: Array[ParamMap]): Array[Vector] = { + val values: ArrayBuffer[Vector] = new ArrayBuffer[Vector] + for {config <- configs} { + var vectorArray: Array[Double] = Array() + paramNames.foreach { paramNames => + vectorArray :+= asDouble(config(paramNames)) + } + values.append(Vectors.dense(vectorArray)) + } + values.toArray + } + + /** + * Get some recommended configurations. + * + * @param size configuration number. + * @return configurations and configuration value vectors. + */ + def getConfigurations(size: Int): (Array[ParamMap], Array[Vector]) = { + val configs: ArrayBuffer[ParamMap] = new ArrayBuffer[ParamMap] + val values: ArrayBuffer[Vector] = new ArrayBuffer[Vector] + for {iter <- 1 to size} { + val paramMap = ParamMap.empty + var vectorArray: Array[Double] = Array() + paramList.foreach(param => { + val x = param.giveParamPair() + paramMap.put(x) + vectorArray :+= asDouble(x.value) + + }) + configs.append(paramMap) + values.append(Vectors.dense(vectorArray)) + } + (configs.toArray, values.toArray) + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamType.scala b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamType.scala new file mode 100644 index 0000000..838d7ea --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/ml/tuning/ParamType.scala @@ -0,0 +1,94 @@ +// scalastyle:off header.matches +/* + * This file to You under the Apache License, Version 2.0; + * you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + */ + +package org.apache.spark.ml.tuning + +import org.apache.spark.ml.param.{Param, ParamPair} + +/** + * Abstract base class for IntParamType and DoubleParmType. + */ +abstract class ParamType[T <: AnyVal] { + /** + * Sample one param from valueRange. + * @return a param value. + */ + def sampleOne(): T + + /** + * get param name. + * @return param name. + */ + def getParamName: Param[T] + + /** + * get param name and sample one param. + * @return a param and its value. + */ + def giveParamPair(): ParamPair[T] +} + +/** + * Param for int type values. + * + * @param valueRange range of param values. + * @param parent parent object. + * @param name param name. + */ +final case class IntParamType(valueRange: BaseRange, parent: String, name: String) + extends ParamType[Int] { + val paramName: Param[Int] = new Param(parent, name, "") + + /** + * Sample one param from valueRange. + * @return a param value. + */ + override def sampleOne(): Int = valueRange.sampleOne().toInt + + /** + * get param name. + * @return param name. + */ + override def getParamName: Param[Int] = paramName + + /** + * get param name and sample one param. + * @return a param and its value. + */ + override def giveParamPair(): ParamPair[Int] = ParamPair(getParamName, sampleOne()) +} + +/** + * Param for Double type values. + * + * @param valueRange range of param values. + * @param parent parent object. + * @param name param name. + */ +final case class DoubleParmType(valueRange: BaseRange, parent: String, name: String) + extends ParamType[Double] { + val paramName: Param[Double] = new Param(parent, name, "") + + /** + * Sample one param from valueRange. + * @return a param value. + */ + override def sampleOne(): Double = valueRange.sampleOne() + + /** + * get param name and sample one param. + * @return a param and its value. 
+ */ + override def getParamName: Param[Double] = paramName + + /** + * get param name and sample one param. + * @return a param and its value. + */ + override def giveParamPair(): ParamPair[Double] = ParamPair(paramName, sampleOne()) +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/mllib/knn/KNNUtils.scala b/tools/kal-test/src/main/scala/org/apache/spark/mllib/knn/KNNUtils.scala new file mode 100644 index 0000000..16aa41f --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/mllib/knn/KNNUtils.scala @@ -0,0 +1,20 @@ +package org.apache.spark.mllib.knn + +import org.apache.spark.ml.{linalg => newlinalg} +import org.apache.spark.mllib.{linalg => oldlinalg} +import org.apache.spark.mllib.util.MLUtils + +object KNNUtils { + + import oldlinalg.VectorImplicits._ + + def fastSquaredDistance( + v1: newlinalg.Vector, + norm1: Double, + v2: newlinalg.Vector, + norm2: Double, + precision: Double = 1e-6): Double = { + MLUtils.fastSquaredDistance(v1, norm1, v2, norm2, precision) + } + +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/mllib/tree/DTBucketModelHelper.scala b/tools/kal-test/src/main/scala/org/apache/spark/mllib/tree/DTBucketModelHelper.scala new file mode 100644 index 0000000..f9ac117 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/mllib/tree/DTBucketModelHelper.scala @@ -0,0 +1,17 @@ +package org.apache.spark.mllib.tree + +import org.apache.spark.ml.classification.DecisionTreeClassificationModel +import org.apache.spark.mllib.tree.model.{DecisionTreeModel => MllibDecisionTreeModel} + +package object helper { + type NodeData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.NodeData + val NodeData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.NodeData + type SplitData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.SplitData + val SplitData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.SplitData + type PredictData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.PredictData + val PredictData = org.apache.spark.mllib.tree.model.DecisionTreeModel.SaveLoadV1_0.PredictData + + def toOldDTModel(dtModel: DecisionTreeClassificationModel): MllibDecisionTreeModel = { + dtModel.toOld + } +} diff --git a/tools/kal-test/src/main/scala/org/apache/spark/util/PublicThreadUtils.scala b/tools/kal-test/src/main/scala/org/apache/spark/util/PublicThreadUtils.scala new file mode 100644 index 0000000..750e7a3 --- /dev/null +++ b/tools/kal-test/src/main/scala/org/apache/spark/util/PublicThreadUtils.scala @@ -0,0 +1,6 @@ +//scalastyle:off +package org.apache.spark.util + +object PublicThreadUtils { + val utils: ThreadUtils.type = ThreadUtils +} -- Gitee
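
For readers studying the SimRank baseline added in this patch: the core of SimRankOpenSource.getSimilarity is the fixed-point update S_{k+1} = damp * (M^T * S_k * M) with the diagonal pinned to 1.0, evaluated on Spark BlockMatrix with shuffle cleanup between iterations. The sketch below reproduces the same update on a hand-made four-node bipartite graph in plain Scala, without Spark, so the numbers can be checked by hand. The object name SimRankToyExample, the toy edges, the damping factor 0.8, and the single symmetric transfer matrix are illustrative choices only; the patched code builds its matrix from out-degree- and in-degree-normalised edges instead.

object SimRankToyExample {
  type Matrix = Array[Array[Double]]

  def multiply(a: Matrix, b: Matrix): Matrix = {
    val n = a.length
    val m = b(0).length
    val inner = b.length
    Array.tabulate(n, m)((i, j) => (0 until inner).map(x => a(i)(x) * b(x)(j)).sum)
  }

  def transpose(a: Matrix): Matrix =
    Array.tabulate(a(0).length, a.length)((i, j) => a(j)(i))

  def main(args: Array[String]): Unit = {
    // Nodes 0 and 1 are users, nodes 2 and 3 are items; edges are the observed interactions.
    val edges = Seq((0, 2), (0, 3), (1, 3))
    val n = 4

    val degree = Array.fill(n)(0)
    edges.foreach { case (u, v) => degree(u) += 1; degree(v) += 1 }

    // Transfer matrix: entry (i, j) = 1 / degree(j) when i and j are connected,
    // a simplified symmetric stand-in for the normalised matrix built in the Spark code.
    val w: Matrix = Array.fill(n, n)(0.0)
    edges.foreach { case (u, v) =>
      w(u)(v) = 1.0 / degree(v)
      w(v)(u) = 1.0 / degree(u)
    }

    val damp = 0.8
    // S_0 is the identity: every node is fully similar to itself.
    var s: Matrix = Array.tabulate(n, n)((i, j) => if (i == j) 1.0 else 0.0)

    for (_ <- 0 until 5) {
      // Same shape as the Spark loop: S_{k+1} = damp * (W^T * S_k * W), diagonal reset to 1.
      val next = multiply(multiply(transpose(w), s), w)
      s = Array.tabulate(n, n)((i, j) => if (i == j) 1.0 else damp * next(i)(j))
    }

    // Users 0 and 1 share item 3, and items 2 and 3 share user 0, so both pairs score above zero.
    println(f"sim(user0, user1) = ${s(0)(1)}%.4f")
    println(f"sim(item2, item3) = ${s(2)(3)}%.4f")
  }
}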