From 95f078439efa2c55f740ff0c49764ee0ebe641a3 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Thu, 8 Feb 2024 10:34:38 -0500
Subject: [PATCH] HNSW fetching tests, pipeline

---
 .../SwiftNLPTests/2. Encoding/HNSWTests.swift |  53 ++++++-
 Tests/SwiftNLPTests/HNSWPipelineTest.swift    | 135 ++++++++++++++++++
 2 files changed, 184 insertions(+), 4 deletions(-)
 create mode 100644 Tests/SwiftNLPTests/HNSWPipelineTest.swift

diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
index 1303379a..249939fe 100644
--- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
+++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
@@ -103,6 +103,49 @@ final class HNSWTests: XCTestCase {
         XCTAssert(corpus.count == 17999)
     }
     
+    func testTypicalNeighborhoodSizes() throws {
+        // TODO: Debug - Fatal error: Double value cannot be converted to Int because it is outside the representable range
+        throw XCTSkip("Disabled until the fatal Double-to-Int conversion error described in the TODO is debugged") // report as skipped rather than as a vacuous pass
+//        let twentyQuotes = [
+//            "Imagination is more important than knowledge. - Albert Einstein",
+//            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
+//            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
+//            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
+//            "Science is the belief in the ignorance of experts. - Richard Feynman",
+//            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
+//            "Science is the poetry of reality. - Richard Dawkins",
+//            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
+//            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
+//            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
+//            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
+//            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
+//            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
+//            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
+//            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
+//            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
+//            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
+//            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
+//            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
+//            "All science is either physics or stamp collecting. - Ernest Rutherford"
+//        ]
+//        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        
+//        for typicalNeighborhoodSize in 0..<100 {
+//            let start = Date()
+//            
+//            var corpus = HNSWCorpus(encoder: _documentEncoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
+//            corpus.addUntokenizedDocuments(twentyQuotes)
+//            
+//            let end = Date()
+//            let runtime = end.timeIntervalSince(start)
+//            
+//            let size = MemoryLayout.size(ofValue: corpus) // NOTE: inline size of the value only; excludes heap-allocated storage
+//            print("Typical Neighbor Size: \(typicalNeighborhoodSize)")
+//            print("Approximate memory footprint: \(size) bytes")
+//            print("Runtime: \(runtime) seconds")
+//        }
+    }
+    
     // Refer to AllMiniLM_sampleTest.swift for reference
     func testBasicQueryExample() async throws {
 
@@ -128,12 +171,14 @@ final class HNSWTests: XCTestCase {
         
         do {
             print("Attempting to query corpus.encodedDocuments.find()...")
-            let query_embedding: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
-            let results = try corpus.encodedDocuments.find(near: query_embedding, limit: 8)
+            
+            // TODO: Print this as a readable result - reverse encoding?
+            let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
+            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 8)
             print(results)
-            print("Query successful!")
+            print("Query completed!")
         } catch {
-            print("Error when trying corpus.encodedDocuments.find: \(error)")
+            print("Error when trying corpus.encodedDocuments.find(): \(error)")
         }
     }
 }
diff --git a/Tests/SwiftNLPTests/HNSWPipelineTest.swift b/Tests/SwiftNLPTests/HNSWPipelineTest.swift
new file mode 100644
index 00000000..4c6d810f
--- /dev/null
+++ b/Tests/SwiftNLPTests/HNSWPipelineTest.swift
@@ -0,0 +1,135 @@
+//#if os(macOS)
+//import XCTest
+//import Foundation
+//import NaturalLanguage
+//@testable import SwiftNLP
+//
+//// MARK: See AllMiniLM_pipelineTest.swift
+//
+//final class HNSWPipelineTest: XCTestCase {
+//
+//    // test fetching names of all the files
+//    func testFileNameFetching() throws {
+//        let redditCommentNames = TestUtils.getJsonFiles(prefix: "RC")
+//        print("reddit comment files: \(redditCommentNames)")
+//        let redditSubmissionNames = TestUtils.getJsonFiles(prefix: "RS")
+//        print("reddit submission files: \(redditSubmissionNames)")
+//    }
+//
+//    // test reading reddit submission json files into actual objects
+//    func testRedditSubmissions() throws {
+//        let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
+//        for jsonData in redditSubmissionJson {
+//            let redditSubmission = TestUtils.readRedditSubmissionJson(json: jsonData)
+//            XCTAssertNotNil(redditSubmission, "Failed to decode RedditSubmissionData")
+//        }
+//    }
+//
+//    // test reading reddit comment json files into actual objects
+//    func testRedditComments() throws {
+//        let redditCommentJson = TestUtils.loadAllRedditComment()
+//        for jsonData in redditCommentJson {
+//            let redditComment = TestUtils.readRedditCommentJson(json: jsonData)
+//            XCTAssertNotNil(redditComment, "Failed to decode RedditCommentData")
+//        }
+//    }
+//
+//    func test20kDownload() async throws {
+//
+//        let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
+//        print("Loaded \(result.count) threads from server.")
+//        if let random = result.randomElement() {
+//            let (key, value) = random
+//            print("Key: \(key), Value: \(value)")
+//        }
+//        XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
+//
+//    }
+//
+//
+//    func testDocumentReading() async throws {
+//        // loads all json data for test documents
+//        let redditCommentJson = TestUtils.loadAllRedditComment()
+//        let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
+//
+//        let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0)}
+//        let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) }
+//
+//        var bodies: [String] = []
+//
+//        // append the body of every reddit comment post (when present) to `bodies`
+//        for comment in redditComments {
+//            //debugPrint("Processing \(comment.posts.count) comments")
+//
+//            for post in comment.posts {
+//                if let body = post.body {
+//                    bodies.append(body)
+//                }
+//            }
+//        }
+//
+//        for submission in redditSubmissions {
+//            //debugPrint("Processing \(submission.posts.count) submissions")
+//
+//            for post in submission.posts {
+//                if let p = post.selftext {
+//                    //debugPrint(p)
+//                    bodies.append(p)
+//                }
+//            }
+//        }
+//
+//        // Debug code
+////        bodies = Array(bodies.prefix(10))
+////        print(bodies)
+//
+//        // start to encode the db and query
+////        var database_embedding: [[Float]] = []
+////        var query_embedding: [Float] = []
+////        let query = "stop playing video games"
+////        var embedding_dim: Int = 384
+////        var model = MiniLMEmbeddings()
+////        query_embedding = await model.encode(sentence: query)!
+////
+////        var i = 1
+////        //append sentence embedding to database_embedding
+////        for string in bodies {
+////            if let vector = await model.encode(sentence: string) {
+////                database_embedding.append(vector)
+////                //print(i)
+////                i += 1
+////            } else {
+////                fatalError("Error occurred1")
+////            }
+////
+////        }
+////        
+//        let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        var corpus = HNSWCorpus(encoder: _documentEncoder)
+//        corpus.addUntokenizedDocuments(bodies)
+//        
+//        let size = MemoryLayout.size(ofValue: corpus) // NOTE: inline size of the value only; excludes heap-allocated storage
+//        print("Approximate memory footprint: \(size) bytes")
+//        
+//        do {
+//            print("Attempting to query corpus.encodedDocuments.find()...")
+//            let query = "stop playing video games"
+//            let queryVector = _documentEncoder.encodeToken(query)
+//            let results = try corpus.encodedDocuments.find(near: queryVector, limit: 10)
+//            print(results)
+//            print("Query completed!")
+//        } catch {
+//            print("Error when trying corpus.encodedDocuments.find(): \(error)")
+//        }
+//
+////        let index = AnnoyIndex<Float>(itemLength: embedding_dim, metric: .euclidean)
+////
+////        try? index.addItems(items: &database_embedding)
+////        try? index.build(numTrees: 50)
+////
+////        let results = index.getNNsForVector(vector: &query_embedding, neighbors: 10)
+////
+////        print(results)
+//    }
+//}
+//#endif
-- 
GitLab