From bf9f7e1365f1cceccf7afa35053b757f7beafbb4 Mon Sep 17 00:00:00 2001
From: wyy566 <531938832@qq.com>
Date: Thu, 27 Oct 2022 10:54:49 +0800
Subject: [PATCH] 1)add dtb comparison script 2)fix mvn command in README.md
---
tools/kal-test/README.md | 4 +-
.../kal-test/bin/compare/ml/major_compare.sh | 4 +-
tools/kal-test/bin/ml/dtb_run.sh | 2 +-
.../com/bigdata/compare/ml/DTBVerify.scala | 44 +++++++++++++++++++
4 files changed, 50 insertions(+), 4 deletions(-)
create mode 100644 tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
diff --git a/tools/kal-test/README.md b/tools/kal-test/README.md
index e4d557c..6aa903b 100644
--- a/tools/kal-test/README.md
+++ b/tools/kal-test/README.md
@@ -14,8 +14,8 @@ The Kunpeng algorithm library test tool can be used to test machine learning and
1. Go to the Spark-ml-algo-lib/tools/kal-test directory in the compilation environment.
2. Install the dependencies.
Take spark 2.3.2 as an example, the install command is as follows:
- mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
- mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-graph-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-graph-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
+ mvn install:install-file -DgroupId=org.apache.spark -DartifactId=boostkit-ml-kernel-client_2.11 -Dversion=2.1.0 -Dclassifier=spark2.3.2 -Dfile=boostkit-ml-kernel-client_2.11-2.1.0-spark2.3.2.jar -Dpackaging=jar -DgeneratePom=true
3. Run the compile command:
mvn clean install
4. View the kal-test_2.11-0.1.jar file generated in Spark-ml-algo-lib/tools/kal-test/target.
diff --git a/tools/kal-test/bin/compare/ml/major_compare.sh b/tools/kal-test/bin/compare/ml/major_compare.sh
index 6972db0..b79b0f9 100644
--- a/tools/kal-test/bin/compare/ml/major_compare.sh
+++ b/tools/kal-test/bin/compare/ml/major_compare.sh
@@ -5,7 +5,7 @@ function usage() {
echo "Usage: "
echo "1st argument: algorithm name: als kmeans linr svm dtr(dt.regression) gbdtr(gbdt.regression) rfr(rf.regression) xgbtr(xgbt.regression)
lda logr svm dtc(dt.classification) gbdtc(gbdt.classification) rfc(rf.classification) xgbtc(xgbt.classification)
- cov pca pearson spca spearman lda ps svd"
+ cov pca pearson spca spearman lda ps svd dtb"
echo "2st argument: path of opt result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1]"
echo "3nd argument: path of raw result: eg [hdfs:///tmp/ml/result/RF/classification_epsilon_dataframe_fit1_raw]"
echo "Applicable to algorithm ALS KMeans LinR SVM GBDT.regression RF.regression XGBT.regression"
@@ -39,6 +39,8 @@ elif [ $alg == "ps" ]; then
class_name=com.bigdata.compare.ml.PrefixSpanVerify
elif [ $alg == "svd" ] ; then
class_name=com.bigdata.compare.ml.SVDVerify
+elif [ $alg == "dtb" ] ; then
+ class_name=com.bigdata.compare.ml.DTBVerify
else
alg_usage
exit 0
diff --git a/tools/kal-test/bin/ml/dtb_run.sh b/tools/kal-test/bin/ml/dtb_run.sh
index 5552874..efc4cd7 100644
--- a/tools/kal-test/bin/ml/dtb_run.sh
+++ b/tools/kal-test/bin/ml/dtb_run.sh
@@ -88,7 +88,7 @@ scala_version_val=${!scala_version}
data_path_val=${!dataset_name}
echo "${dataset_name} : ${data_path_val}"
-bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${spark_version_val}/${dataset_name}"
+bucketedResPath="/tmp/ml/res/DTB_ref_bucketedRes/${is_raw}/${spark_version_val}/${dataset_name}"
hdfs dfs -mkdir -p ${bucketedResPath}
spark_conf=${master_val}_${deploy_mode_val}_${num_executors_val}_${executor_cores_val}_${executor_memory_val}
diff --git a/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
new file mode 100644
index 0000000..9455291
--- /dev/null
+++ b/tools/kal-test/src/main/scala/com/bigdata/compare/ml/DTBVerify.scala
@@ -0,0 +1,57 @@
+package com.bigdata.compare.ml
+
+import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.SparkSession
+
+import java.io.FileWriter
+
+/**
+ * Verifies a Decision Tree Bucketizer (DTB) run by comparing the optimized
+ * result set against the reference result set on HDFS, appending the verdict
+ * ("correct" / "incorrect" / "invaildComparison") to report/ml_isCorrect.txt.
+ */
+object DTBVerify {
+  def main(args: Array[String]): Unit = {
+    val path0 = args(0) // optimized result path
+    val path1 = args(1) // reference (raw) result path
+    val sparkConf = new SparkConf().setAppName("DTBVerify")
+    val spark = SparkSession.builder.config(sparkConf).getOrCreate()
+    val isCorrect = compareRes(path0, path1, spark)
+    val writerIsCorrect = new FileWriter(s"report/ml_isCorrect.txt", true)
+    try {
+      writerIsCorrect.write(s"${isCorrect}:\n${path0}\n${path1}\n")
+    } finally {
+      writerIsCorrect.close() // close even if the write throws
+    }
+    println(s"${isCorrect}!")
+  }
+
+  /**
+   * Compares the text lines of the two result paths.
+   *
+   * @return "correct" when neither side has lines absent from the other,
+   *         "incorrect" when they differ, "invaildComparison" when a path
+   *         does not exist. NOTE(review): "invaildComparison" is misspelled
+   *         but appears to be the sentinel shared with the sibling *Verify
+   *         classes - confirm before renaming.
+   */
+  def compareRes(path0: String, path1: String, spark: SparkSession): String = {
+    val sc = spark.sparkContext
+    val fs = FileSystem.get(sc.hadoopConfiguration)
+    if (!fs.exists(new Path(path0)) || !fs.exists(new Path(path1))) {
+      "invaildComparison"
+    } else {
+      val output = sc.textFile(path0).repartition(100)
+      val refRes = sc.textFile(path1).repartition(100)
+      val dataDiff1Cnt = output.subtract(refRes).count()
+      val dataDiff2Cnt = refRes.subtract(output).count()
+      if (dataDiff1Cnt != 0 || dataDiff2Cnt != 0) {
+        System.err.println(s"[ERROR] diff1Cnt: ${dataDiff1Cnt}, diff2Cnt: ${dataDiff2Cnt}")
+        System.err.println("output data is mismatch!")
+        "incorrect"
+      } else {
+        "correct"
+      }
+    }
+  }
+}
--
Gitee