From bf9f7e1365f1cceccf7afa35053b757f7beafbb4 Mon Sep 17 00:00:00 2001
From: wyy566 <531938832@qq.com>
Date: Thu, 27 Oct 2022 10:54:49 +0800
Subject: [PATCH] 1)add dtb comparison script 2)fix mvn command in README.md
---
tools/kal-test/README.md | 4 +-
.../kal-test/bin/compare/ml/major_compare.sh | 4 +-
tools/kal-test/bin/ml/dtb_run.sh | 2 +-
.../com/bigdata/compare/ml/DTBVerify.scala | 44 +++++++++++++++++++
4 files changed, 50 insertions(+), 4 deletions(-)
create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
diff --git a/tools/kal-test/README.md b/tools/kal-test/README.md
index e4d557c..6aa903b 100644
--- a/tools/kal-test/README.md
+++ b/tools/kal-test/README.md
@@ -14,8 +14,8 @@ The Kunpeng algorithm library test tool can be used to test machine learning and
1. Go to the Spark-ml-algo-lib/tools/kal-test directory in the compilation environment.
2. Install the dependencies.
Take spark 2.3.2 as an example, the install command is as follows:
- mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
- mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
3. Run the compile command:
mvn clean install
4. View the kal-test_2.11-0.1.jar file generated in Spark-ml-algo-lib/tools/kal-test/target.
diff --git a/tools/kal-test/bin/compare/ml/major_compare.sh b/tools/kal-test/bin/compare/ml/major_compare.sh
index 6972db0..b79b0f9 100644
--- a/tools/kal-test/bin/compare/ml/major_compare.sh
+++ b/tools/kal-test/bin/compare/ml/major_compare.sh
@@ -5,7 +5,7 @@ function usage() {
echo "Usage: "
echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression)
lda logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification)
- cov pca pearson spca spearman lda ps svd"
+ cov pca pearson spca spearman lda ps svd dtb"
echo "2st argument: path of opt result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1]"
echo "3nd argument: path of raw result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1_raw]"
echo "Applicable to algorithm ALS KMeans LinR SVM GBDT.regression RF.regression XGBT.regression"
@@ -39,6 +39,8 @@ elif [ $alg == "ps" ]; then
class_name=com.bigdata.compare.ml.PrefixSpanVerify
elif [ $alg == "svd" ] ; then
class_name=com.bigdata.compare.ml.SVDVerify
+elif [ $alg == "dtb" ] ; then
+ class_name=com.bigdata.compare.ml.DTBVerify
else
alg_usage
exit 0
diff --git a/tools/kal-test/bin/ml/dtb_run.sh b/tools/kal-test/bin/ml/dtb_run.sh
index 5552874..efc4cd7 100644
--- a/tools/kal-test/bin/ml/dtb_run.sh
+++ b/tools/kal-test/bin/ml/dtb_run.sh
@@ -88,7 +88,7 @@ scala_version_val=${!scala_version}
data_path_val=${!dataset_name}
echo "${dataset_name} : ${data_path_val}"
-bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${spark_version_val}/${dataset_name}"
+bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${is_raw}/${spark_version_val}/${dataset_name}"
hdfs dfs -mkdir -p ${bucketedResPath}
spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val}
diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
new file mode 100644
index 0000000..9455291
--- /dev/null
+++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
@@ -0,0 +1,57 @@
+package com.bigdata.compare.ml
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.SparkSession
+
+import java.io.FileWriter
+
+/**
+ * Verifies a Decision Tree Bucketizer (DTB) run by comparing the optimized
+ * result set against the reference result set on HDFS, appending the verdict
+ * ("correct" / "incorrect" / "invaildComparison") to report/ml_isCorrect.txt.
+ */
+object DTBVerify {
+  def main(args: Array[String]): Unit = {
+    val path0 = args(0) // optimized result path
+    val path1 = args(1) // reference (raw) result path
+    val sparkConf = new SparkConf().setAppName("DTBVerify")
+    val spark = SparkSession.builder.config(sparkConf).getOrCreate()
+    val isCorrect = compareRes(path0, path1, spark)
+    val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true)
+    try {
+      writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n")
+    } finally {
+      writerIsCorrect.close() // close even if the write throws
+    }
+    println(s"${isCorrect}!")
+  }
+
+  /**
+   * Compares the text lines of the two result paths.
+   *
+   * @return "correct" when neither side has lines absent from the other,
+   *         "incorrect" when they differ, "invaildComparison" when a path
+   *         does not exist. NOTE(review): "invaildComparison" is misspelled
+   *         but appears to be the sentinel shared with the sibling *Verify
+   *         classes - confirm before renaming.
+   */
+  def compareRes(path0: String, path1: String, spark: SparkSession): String = {
+    val sc = spark.sparkContext
+    val fs = FileSystem.get(sc.hadoopConfiguration)
+    if (!fs.exists(new Path(path0)) || !fs.exists(new Path(path1))) {
+      "invaildComparison"
+    } else {
+      val output = sc.textFile(path0).repartition(100)
+      val refRes = sc.textFile(path1).repartition(100)
+      val dataDiff1Cnt = output.subtract(refRes).count()
+      val dataDiff2Cnt = refRes.subtract(output).count()
+      if (dataDiff1Cnt != 0 || dataDiff2Cnt != 0) {
+        System.err.println(s"[ERROR] diff1Cnt: ${dataDiff1Cnt}, diff2Cnt: ${dataDiff2Cnt}")
+        System.err.println("output data is mismatch!")
+        "incorrect"
+      } else {
+        "correct"
+      }
+    }
+  }
+}
--
Gitee