From c77ca07725884bbec4ba5940e731dc36f8f71777 Mon Sep 17 00:00:00 2001
From: Johannes Altmanninger <aclopte@gmail.com>
Date: Sun, 20 Aug 2017 12:09:07 +0000
Subject: [PATCH] [clang-diff] Fix similarity computation

Summary:
Add separate tests for the top-down and the bottom-up phase, as well as
one for the optimal matching.

Reviewers: arphaman

Subscribers: klimek

Differential Revision: https://reviews.llvm.org/D36185

git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@311284 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/clang/Tooling/ASTDiff/ASTDiff.h |  4 ++-
 lib/Tooling/ASTDiff/ASTDiff.cpp         | 35 +++++++++++-------
 test/Tooling/clang-diff-bottomup.cpp    | 39 ++++++++++++++++++++
 test/Tooling/clang-diff-opt.cpp         | 44 +++++++++++++++++++++++
 test/Tooling/clang-diff-topdown.cpp     | 48 +++++++++++++++++++++++++
 tools/clang-diff/ClangDiff.cpp          | 13 +++++++
 6 files changed, 169 insertions(+), 14 deletions(-)
 create mode 100644 test/Tooling/clang-diff-bottomup.cpp
 create mode 100644 test/Tooling/clang-diff-opt.cpp
 create mode 100644 test/Tooling/clang-diff-topdown.cpp

diff --git a/include/clang/Tooling/ASTDiff/ASTDiff.h b/include/clang/Tooling/ASTDiff/ASTDiff.h
index ac33dc42d81..3260e2f057c 100644
--- a/include/clang/Tooling/ASTDiff/ASTDiff.h
+++ b/include/clang/Tooling/ASTDiff/ASTDiff.h
@@ -105,12 +105,14 @@ struct ComparisonOptions {
 
   /// During bottom-up matching, match only nodes with at least this value as
   /// the ratio of their common descendants.
-  double MinSimilarity = 0.2;
+  double MinSimilarity = 0.5;
 
   /// Whenever two subtrees are matched in the bottom-up phase, the optimal
   /// mapping is computed, unless the size of either subtrees exceeds this.
   int MaxSize = 100;
 
+  bool StopAfterTopDown = false;
+
   /// Returns false if the nodes should never be matched.
   bool isMatchingAllowed(const Node &N1, const Node &N2) const {
     return N1.getType().isSame(N2.getType());
diff --git a/lib/Tooling/ASTDiff/ASTDiff.cpp b/lib/Tooling/ASTDiff/ASTDiff.cpp
index ada7cbcfc55..22f3a4e4f06 100644
--- a/lib/Tooling/ASTDiff/ASTDiff.cpp
+++ b/lib/Tooling/ASTDiff/ASTDiff.cpp
@@ -92,7 +92,7 @@ private:
 
   // Computes the ratio of common descendants between the two nodes.
   // Descendants are only considered to be equal when they are mapped in M.
-  double getSimilarity(const Mapping &M, NodeId Id1, NodeId Id2) const;
+  double getJaccardSimilarity(const Mapping &M, NodeId Id1, NodeId Id2) const;
 
   // Returns the node that has the highest degree of similarity.
   NodeId findCandidate(const Mapping &M, NodeId Id1) const;
@@ -714,8 +714,8 @@ bool ASTDiff::Impl::haveSameParents(const Mapping &M, NodeId Id1,
 
 void ASTDiff::Impl::addOptimalMapping(Mapping &M, NodeId Id1,
                                       NodeId Id2) const {
-  if (std::max(T1.getNumberOfDescendants(Id1),
-               T2.getNumberOfDescendants(Id2)) >= Options.MaxSize)
+  if (std::max(T1.getNumberOfDescendants(Id1), T2.getNumberOfDescendants(Id2)) >
+      Options.MaxSize)
     return;
   ZhangShashaMatcher Matcher(*this, T1, T2, Id1, Id2);
   std::vector<std::pair<NodeId, NodeId>> R = Matcher.getMatchingNodes();
@@ -727,16 +727,23 @@ void ASTDiff::Impl::addOptimalMapping(Mapping &M, NodeId Id1,
   }
 }
 
-double ASTDiff::Impl::getSimilarity(const Mapping &M, NodeId Id1,
-                                    NodeId Id2) const {
-  if (Id1.isInvalid() || Id2.isInvalid())
-    return 0.0;
+double ASTDiff::Impl::getJaccardSimilarity(const Mapping &M, NodeId Id1,
+                                           NodeId Id2) const {
   int CommonDescendants = 0;
   const Node &N1 = T1.getNode(Id1);
-  for (NodeId Id = Id1 + 1; Id <= N1.RightMostDescendant; ++Id)
-    CommonDescendants += int(T2.isInSubtree(M.getDst(Id), Id2));
-  return 2.0 * CommonDescendants /
-         (T1.getNumberOfDescendants(Id1) + T2.getNumberOfDescendants(Id2));
+  // Count the common descendants, excluding the subtree root.
+  for (NodeId Src = Id1 + 1; Src <= N1.RightMostDescendant; ++Src) {
+    NodeId Dst = M.getDst(Src);
+    CommonDescendants += int(Dst.isValid() && T2.isInSubtree(Dst, Id2));
+  }
+  // We need to subtract 1 to get the number of descendants excluding the root.
+  double Denominator = T1.getNumberOfDescendants(Id1) - 1 +
+                       T2.getNumberOfDescendants(Id2) - 1 - CommonDescendants;
+  // CommonDescendants is less than the size of one subtree.
+  assert(Denominator >= 0 && "Expected non-negative denominator.");
+  if (Denominator == 0)
+    return 0;
+  return CommonDescendants / Denominator;
 }
 
 NodeId ASTDiff::Impl::findCandidate(const Mapping &M, NodeId Id1) const {
@@ -747,7 +754,7 @@ NodeId ASTDiff::Impl::findCandidate(const Mapping &M, NodeId Id1) const {
       continue;
     if (M.hasDst(Id2))
       continue;
-    double Similarity = getSimilarity(M, Id1, Id2);
+    double Similarity = getJaccardSimilarity(M, Id1, Id2);
     if (Similarity >= Options.MinSimilarity && Similarity > HighestSimilarity) {
       HighestSimilarity = Similarity;
       Candidate = Id2;
@@ -767,8 +774,8 @@ void ASTDiff::Impl::matchBottomUp(Mapping &M) const {
       }
       break;
     }
-    const Node &N1 = T1.getNode(Id1);
     bool Matched = M.hasSrc(Id1);
+    const Node &N1 = T1.getNode(Id1);
     bool MatchedChildren =
         std::any_of(N1.Children.begin(), N1.Children.end(),
                     [&](NodeId Child) { return M.hasSrc(Child); });
@@ -836,6 +843,8 @@ ASTDiff::Impl::Impl(SyntaxTree::Impl &T1, SyntaxTree::Impl &T2,
 
 void ASTDiff::Impl::computeMapping() {
   TheMapping = matchTopDown();
+  if (Options.StopAfterTopDown)
+    return;
   matchBottomUp(TheMapping);
 }
 
diff --git a/test/Tooling/clang-diff-bottomup.cpp b/test/Tooling/clang-diff-bottomup.cpp
new file mode 100644
index 00000000000..8193ad7bbd6
--- /dev/null
+++ b/test/Tooling/clang-diff-bottomup.cpp
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -E %s > %t.src.cpp
+// RUN: %clang_cc1 -E %s > %t.dst.cpp -DDEST
+// RUN: clang-diff -dump-matches -s=0 %t.src.cpp %t.dst.cpp -- | FileCheck %s
+//
+// Test the bottom-up matching, with maxsize set to 0, so that the optimal matching will never be applied.
+
+#ifndef DEST
+
+void f1() { ; {{;}} }
+void f2() { ;; {{;}} }
+
+#else
+
+// Jaccard similarity threshold is 0.5.
+
+void f1() {
+// CompoundStmt: 3 matched descendants, subtree sizes 4 and 5
+// Jaccard similarity = 3 / (4 + 5 - 3) = 3 / 6 >= 0.5
+// CHECK: Match FunctionDecl: f1(void (void))(1) to FunctionDecl: f1(void (void))(1)
+// CHECK: Match CompoundStmt(2) to CompoundStmt(2)
+// CHECK: Match CompoundStmt(4) to CompoundStmt(3)
+// CHECK: Match CompoundStmt(5) to CompoundStmt(4)
+// CHECK: Match NullStmt(6) to NullStmt(5)
+  {{;}} ;;
+}
+
+void f2() {
+// CompoundStmt: 3 matched descendants, subtree sizes 4 and 5
+// Jaccard similarity = 3 / (5 + 6 - 3) = 3 / 8 < 0.5
+// CHECK-NOT: Match FunctionDecl(9)
+// CHECK-NOT: Match CompoundStmt(10)
+// CHECK: Match CompoundStmt(11) to CompoundStmt(10)
+// CHECK: Match CompoundStmt(12) to CompoundStmt(11)
+// CHECK: Match NullStmt(13) to NullStmt(12)
+// CHECK-NOT: Match NullStmt(13)
+  {{;}} ;;;
+}
+
+#endif
diff --git a/test/Tooling/clang-diff-opt.cpp b/test/Tooling/clang-diff-opt.cpp
new file mode 100644
index 00000000000..567483de1f2
--- /dev/null
+++ b/test/Tooling/clang-diff-opt.cpp
@@ -0,0 +1,44 @@
+// RUN: %clang_cc1 -E %s > %t.src.cpp
+// RUN: %clang_cc1 -E %s > %t.dst.cpp -DDEST
+// RUN: clang-diff -dump-matches -s=10 %t.src.cpp %t.dst.cpp -- | FileCheck %s
+//
+// Test the behaviour of the matching according to the optimal tree edit
+// distance, implemented with Zhang and Shasha's algorithm.
+// Just for testing we use a tiny value of 10 for maxsize. Subtrees bigger than
+// this size will not be processed by the optimal algorithm.
+
+#ifndef DEST
+
+void f1() { {;} {{;}} }
+
+void f2() { {;} {{;;;;;}} }
+
+void f3() { {;} {{;;;;;;}} }
+
+#else
+
+void f1() {
+// Jaccard similarity = 3 / (5 + 4 - 3) = 3 / 6 >= 0.5
+// The optimal matching algorithm should move the ; into the outer block
+// CHECK: Match CompoundStmt(2) to CompoundStmt(2)
+// CHECK-NOT: Match CompoundStmt(3)
+// CHECK-NEXT: Match NullStmt(4) to NullStmt(3)
+  ; {{;}}
+}
+
+void f2() {
+  // Jaccard similarity = 7 / (10 + 10 - 7) >= 0.5
+  // As none of the subtrees is bigger than 10 nodes, the optimal algorithm
+  // will be run.
+  // CHECK: Match NullStmt(11) to NullStmt(9)
+  ;; {{;;;;;}}
+}
+
+void f3() {
+  // Jaccard similarity = 8 / (11 + 11 - 8) >= 0.5
+  // As the subtrees are bigger than 10 nodes, the optimal algorithm will not
+  // be run.
+  // CHECK: Delete NullStmt(22)
+  ;; {{;;;;;;}}
+}
+#endif
diff --git a/test/Tooling/clang-diff-topdown.cpp b/test/Tooling/clang-diff-topdown.cpp
new file mode 100644
index 00000000000..55553899392
--- /dev/null
+++ b/test/Tooling/clang-diff-topdown.cpp
@@ -0,0 +1,48 @@
+// RUN: %clang_cc1 -E %s > %t.src.cpp
+// RUN: %clang_cc1 -E %s > %t.dst.cpp -DDEST
+// RUN: clang-diff -dump-matches -stop-after=topdown %t.src.cpp %t.dst.cpp -- | FileCheck %s
+//
+// Test the top-down matching of identical subtrees only.
+
+#ifndef DEST
+
+void f1()
+{
+  // Match some subtree of height greater than 2.
+  // CHECK: Match CompoundStmt(3) to CompoundStmt(3)
+  // CHECK: Match CompoundStmt(4) to CompoundStmt(4)
+  // CHECK: Match NullStmt(5) to NullStmt(5)
+  {{;}}
+
+  // Don't match subtrees that are smaller.
+  // CHECK-NOT: Match CompoundStmt(6)
+  // CHECK-NOT: Match NullStmt(7)
+  {;}
+
+  // Greedy approach - use the first matching subtree when there are multiple
+  // identical subtrees.
+  // CHECK: Match CompoundStmt(8) to CompoundStmt(8)
+  // CHECK: Match CompoundStmt(9) to CompoundStmt(9)
+  // CHECK: Match NullStmt(10) to NullStmt(10)
+  {{;;}}
+}
+
+#else
+
+void f1() {
+
+  {{;}}
+
+  {;}
+
+  {{;;}}
+  // CHECK-NOT: Match {{.*}} to CompoundStmt(11)
+  // CHECK-NOT: Match {{.*}} to CompoundStmt(12)
+  // CHECK-NOT: Match {{.*}} to NullStmt(13)
+  {{;;}}
+
+  // CHECK-NOT: Match {{.*}} to NullStmt(14)
+  ;
+}
+
+#endif
diff --git a/tools/clang-diff/ClangDiff.cpp b/tools/clang-diff/ClangDiff.cpp
index 7d6d9c81966..0bfa4558fe1 100644
--- a/tools/clang-diff/ClangDiff.cpp
+++ b/tools/clang-diff/ClangDiff.cpp
@@ -50,6 +50,11 @@ static cl::opt<std::string> DestinationPath(cl::Positional,
                                             cl::Optional,
                                             cl::cat(ClangDiffCategory));
 
+static cl::opt<std::string> StopAfter("stop-after",
+                                      cl::desc("<topdown|bottomup>"),
+                                      cl::Optional, cl::init(""),
+                                      cl::cat(ClangDiffCategory));
+
 static cl::opt<int> MaxSize("s", cl::desc("<maxsize>"), cl::Optional,
                             cl::init(-1), cl::cat(ClangDiffCategory));
 
@@ -425,6 +430,14 @@ int main(int argc, const char **argv) {
   diff::ComparisonOptions Options;
   if (MaxSize != -1)
     Options.MaxSize = MaxSize;
+  if (!StopAfter.empty()) {
+    if (StopAfter == "topdown")
+      Options.StopAfterTopDown = true;
+    else if (StopAfter != "bottomup") {
+      llvm::errs() << "Error: Invalid argument for -stop-after\n";
+      return 1;
+    }
+  }
   diff::SyntaxTree SrcTree(Src->getASTContext());
   diff::SyntaxTree DstTree(Dst->getASTContext());
   diff::ASTDiff Diff(SrcTree, DstTree, Options);
-- 
GitLab