From bf9f7e1365f1cceccf7afa35053b757f7beafbb4 Mon Sep 17 00:00:00 2001 From: wyy566 <531938832@qq.com> Date: Thu, 27 Oct 2022 10:54:49 +0800 Subject: [PATCH] 1)add dtb comparison script 2)fix mvn command in README.md --- tools/kal-test/README.md | 4 +- .../kal-test/bin/compare/ml/major_compare.sh | 4 +- tools/kal-test/bin/ml/dtb_run.sh | 2 +- .../com/bigdata/compare/ml/DTBVerify.scala | 44 +++++++++++++++++++ 4 files changed, 50 insertions(+), 4 deletions(-) create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala diff --git a/tools/kal-test/README.md b/tools/kal-test/README.md index e4d557c..6aa903b 100644 --- a/tools/kal-test/README.md +++ b/tools/kal-test/README.md @@ -14,8 +14,8 @@ The Kunpeng algorithm library test tool can be used to test machine learning and 1. Go to the Spark-ml-algo-lib/tools/kal-test directory in the compilation environment. 2. Install the dependencies.
Take spark 2.3.2 as an example, the install command is as follows:
-   mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
-   mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true + mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true 3. Run the compile command:
mvn clean install 4. View the kal-test_2.11-0.1.jar file generated in Spark-ml-algo-lib/tools/kal-test/target. diff --git a/tools/kal-test/bin/compare/ml/major_compare.sh b/tools/kal-test/bin/compare/ml/major_compare.sh index 6972db0..b79b0f9 100644 --- a/tools/kal-test/bin/compare/ml/major_compare.sh +++ b/tools/kal-test/bin/compare/ml/major_compare.sh @@ -5,7 +5,7 @@ function usage() { echo "Usage: " echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression) lda logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification) - cov pca pearson spca spearman lda ps svd" + cov pca pearson spca spearman lda ps svd dtb" echo "2st argument: path of opt result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1]" echo "3nd argument: path of raw result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1_raw]" echo "Applicable to algorithm ALS KMeans LinR SVM GBDT.regression RF.regression XGBT.regression" @@ -39,6 +39,8 @@ elif [ $alg == "ps" ]; then class_name=com.bigdata.compare.ml.PrefixSpanVerify elif [ $alg == "svd" ] ; then class_name=com.bigdata.compare.ml.SVDVerify +elif [ $alg == "dtb" ] ; then + class_name=com.bigdata.compare.ml.DTBVerify else alg_usage exit 0 diff --git a/tools/kal-test/bin/ml/dtb_run.sh b/tools/kal-test/bin/ml/dtb_run.sh index 5552874..efc4cd7 100644 --- a/tools/kal-test/bin/ml/dtb_run.sh +++ b/tools/kal-test/bin/ml/dtb_run.sh @@ -88,7 +88,7 @@ scala_version_val=${!scala_version} data_path_val=${!dataset_name} echo "${dataset_name} : ${data_path_val}" -bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${spark_version_val}/${dataset_name}" +bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${is_raw}/${spark_version_val}/${dataset_name}" hdfs dfs -mkdir -p ${bucketedResPath} spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val} diff 
package com.bigdata.compare.ml

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.spark.sql.SparkSession

import java.io.FileWriter

/**
 * Compares a DTB (decision-tree bucketizer) result against a reference result
 * on HDFS and appends the verdict to a local report file.
 *
 * Invoked by bin/compare/ml/major_compare.sh with two arguments:
 * args(0) = path of the optimized result, args(1) = path of the raw/reference result.
 */
object DTBVerify {
  def main(args: Array[String]): Unit = {
    val path0 = args(0)
    val path1 = args(1)
    val sparkConf = new SparkConf().setAppName("DTBVerify")
    val spark = SparkSession.builder.config(sparkConf).getOrCreate()
    val isCorrect = compareRes(path0, path1, spark)
    // Append (not overwrite) so one report accumulates verdicts across runs.
    val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true)
    try {
      writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n")
    } finally {
      // Close even if write throws, so the file handle is never leaked.
      writerIsCorrect.close()
    }
    println(s"${isCorrect}!")
  }

  /**
   * Compares the two text outputs as line multisets (order-insensitive):
   * each side is subtracted from the other and both differences must be empty.
   *
   * @param path0 HDFS path of the result under test
   * @param path1 HDFS path of the reference result
   * @param spark active session; only its SparkContext is used
   * @return "correct" when the line sets match, "incorrect" when they differ,
   *         or "invaildComparison" when either path is missing.
   *         NOTE(review): "invaildComparison" is misspelled, but it is kept
   *         byte-identical because sibling *Verify classes and report consumers
   *         may match this exact string — confirm before renaming.
   */
  def compareRes(path0: String, path1: String, spark: SparkSession): String = {
    val sc = spark.sparkContext
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val res1File = new Path(path0)
    val res2File = new Path(path1)
    if (!fs.exists(res1File) || !fs.exists(res2File)) {
      "invaildComparison"
    } else {
      // Repartition to spread the set-difference work across the cluster.
      val output = sc.textFile(path0).repartition(100)
      val refRes = sc.textFile(path1).repartition(100)
      val dataDiff1Cnt = output.subtract(refRes).count()
      val dataDiff2Cnt = refRes.subtract(output).count()
      if (dataDiff1Cnt != 0 || dataDiff2Cnt != 0) {
        System.err.println(s"[ERROR] diff1Cnt: ${dataDiff1Cnt}, diff2Cnt: ${dataDiff2Cnt}")
        System.err.println("output data is mismatch!")
        "incorrect"
      } else {
        "correct"
      }
    }
  }
}