From 7459a06d00d3c1aec86e88fa821bc856d4466af4 Mon Sep 17 00:00:00 2001
From: Jim Wallace <james.wallace@uwaterloo.ca>
Date: Fri, 5 Apr 2024 15:34:32 -0400
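Subject: [PATCH] Disable DurableHNSWCorpus tests; default CoreMLEncoder to all_MiniLM_L6_v2

- Comment out DurableHNSWCorpusTests.swift in its entirety (the tests are
  disabled, not deleted).
- CoreMLEncoder.init() now defaults to the "all_MiniLM_L6_v2" model with
  384 dimensions and a 384-element zero vector.
- AllMiniLM_sampleTest no longer exercises the "gte-small" model.
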
---
 .../SwiftNLP/2. Encoding/CoreMLEncoder.swift  |   6 +-
 .../HNSW/DurableHNSWCorpusTests.swift         | 458 +++++++++---------
 .../SwiftNLPTests/AllMiniLM_sampleTest.swift  |   2 +-
 3 files changed, 233 insertions(+), 233 deletions(-)

diff --git a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift
index 311ba600..b98aa739 100644
--- a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift	
+++ b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift	
@@ -46,9 +46,9 @@ class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
     var model: String
         
     required init() {
-      zeroes = []
-      dimensions = 0
-      model = "all-MiniLM-L6-v2"
+      zeroes = Array(repeating: Scalar(0), count: 384)
+      dimensions = 384
+      model = "all_MiniLM_L6_v2"
     }
     
     
diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift
index 358e0f9e..545a525e 100644
--- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
+++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
@@ -1,230 +1,230 @@
-#if os(macOS)
-import XCTest
-import Foundation
-import CoreLMDB
-import System
-@testable import SwiftNLP
-
-// MARK: These tests are not to be included within the pipeline
-
-final class DurableHNSWCorpusTests: XCTestCase {
-    /// This is used to skip these tests in the GitLab pipeline
-    override class var defaultTestSuite: XCTestSuite {
-        if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
-            return XCTestSuite(name: "Empty")
-        }
-        return super.defaultTestSuite
-    }
-    
-    /// Setting up constants for environment
-    private let ONE_GB: Int = 1_073_741_824
-    private let ONE_MB: Int = 1_048_576
-    private let ONE_KB: Int = 1_024
-    private let ONE_B:  Int = 1
-    private let DEFAULT_MAXREADERS: UInt32 = 126
-    private let DEFAULT_MAXDBS:     UInt32 = 10
-    
-    /// Setting up working directory
-    private var workingDirectoryPath: FilePath!
-    
-    override func setUpWithError() throws {
-        try super.setUpWithError()
-        
-        let fileManager = FileManager.default
-        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
-        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
-        workingDirectoryPath = FilePath(directoryURL.path)
-        
-        /// This commented out code alternatively works in the XCode bundle resource environment
-//        guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
-//        let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
+//#if os(macOS)
+//import XCTest
+//import Foundation
+//import CoreLMDB
+//import System
+//@testable import SwiftNLP
+//
+//// MARK: These tests are not to be included within the pipeline
+//
+//final class DurableHNSWCorpusTests: XCTestCase {
+//    /// This is used to skip these tests in the GitLab pipeline
+//    override class var defaultTestSuite: XCTestSuite {
+//        if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
+//            return XCTestSuite(name: "Empty")
+//        }
+//        return super.defaultTestSuite
+//    }
+//    
+//    /// Setting up constants for environment
+//    private let ONE_GB: Int = 1_073_741_824
+//    private let ONE_MB: Int = 1_048_576
+//    private let ONE_KB: Int = 1_024
+//    private let ONE_B:  Int = 1
+//    private let DEFAULT_MAXREADERS: UInt32 = 126
+//    private let DEFAULT_MAXDBS:     UInt32 = 10
+//    
+//    /// Setting up working directory
+//    private var workingDirectoryPath: FilePath!
+//    
+//    override func setUpWithError() throws {
+//        try super.setUpWithError()
+//        
 //        let fileManager = FileManager.default
-//        try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
-//        print("Resources directory: \(resourcesDirectoryURL)")
-//        workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
-    }
-    
-    func testBuildBasicCorpus() throws {
-        let docs = [
-            "CNTK formerly known as Computational Network Toolkit",
-            "is a free easy-to-use open-source commercial-grade toolkit",
-            "that enable us to train deep learning algorithms to learn like the human brain."
-        ]
-        
-        /// Setting up the environment
-        let env = try Environment()
-        try env.setMapSize(ONE_GB)
-        try env.setMaxReaders(DEFAULT_MAXREADERS)
-        try env.setMaxDBs(DEFAULT_MAXDBS)
-        try env.open(path: workingDirectoryPath)
-        
-        /// Writing to LMDB
-        let transaction = try Transaction.begin(.write, in: env)
-
-        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
-            namespace: "testBasicExample",
-            in: transaction
-        )
-        
-        for doc in docs {
-            try corpus.addUntokenizedDocument(doc, in: transaction)
-        }
-        
-        try transaction.commit()
-        
-        /// Reading from LMDB
-        let readTransaction = try Transaction.begin(.read, in: env)
-        
-        let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
-            namespace: "testBasicExample",
-            in: readTransaction
-        )
-        
-        readTransaction.abort()
-        
-        // XCTAssert(readCorpus.count == 3)
-        /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
-        /// This is because size is only incremented when insertion is called but it is not called when read from disk!
-    }
-    
-    func testQueryBasicCorpus() async throws {
-        let docs = [
-            "The quick brown fox jumps over the lazy dog",
-            "I enjoy taking long walks along the beach at sunset",
-            "Advances in neural networks have enabled new AI capabilities",
-            "The stock market experienced a significant downturn last week",
-            "Cooking a good meal can be both an art and a science",
-            "The exploration of space is both challenging and rewarding",
-            "Machine learning models are becoming increasingly sophisticated",
-            "I love reading about history and ancient civilizations"
-        ]
-        
-        let query = "I like to read about new technology and artificial intelligence"
-        //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        
-        /// Setting up the environment
-        let env = try Environment()
-        try env.setMapSize(ONE_GB)
-        try env.setMaxReaders(DEFAULT_MAXREADERS)
-        try env.setMaxDBs(DEFAULT_MAXDBS)
-        try env.open(path: workingDirectoryPath)
-        
-        let transaction = try Transaction.begin(.write, in: env)
-        
-        /// Saving the memory map to disk
-        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
-            namespace: "testBasicQueryExample",
-            in: transaction
-        )
-        
-        for doc in docs {
-            try corpus.addUntokenizedDocument(doc, in: transaction)
-        }
-        
-        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
-        
-        try transaction.commit()
-        
-        do {
-            let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }
-            
-            /// Reading the memory map (and dictionary) from disk
-            let readTransaction = try Transaction.begin(.write, in: env)
-            
-            let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
-                namespace: "testBasicQueryExample",
-                in: readTransaction
-            )
-            
-            readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?
-            
-            let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction)
-            
-            for result in result {
-                let key = Int(result.id.foreignKey)!
-                print(readCorpus.getUntokenizedDocument(at: key))
-            }
-        } catch {
-            print("Error when trying corpus.encodedDocuments.find(): \(error)")
-        }
-        
-        try transaction.commit()
-    }
-    
-    func testBuildGuelphSubredditCorpus() async throws {
-        /// Generates the LMDB durable storage to disk but runs no tests otherwise
-    
-        /// Setting up the environment
-        let env = try Environment()
-        try env.setMapSize(ONE_GB)
-        try env.setMaxReaders(DEFAULT_MAXREADERS)
-        try env.setMaxDBs(DEFAULT_MAXDBS)
-        try env.open(path: workingDirectoryPath)
-        
-        /// Get subreddit data
-        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
-            fatalError("Failed to find waterloo_submissions.zst in test bundle.")
-        }
-        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
-            fatalError("Failed to load waterloo_submissions.zst from test bundle.")
-        }
-
-        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
-        
-        let transaction = try Transaction.begin(.write, in: env)
-        
-        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        
-        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
-            encoder: documentEncoder,
-            namespace: "subreddit_durable",
-            in: transaction
-        )
-
-        /// Add documents to corpus
-        for submission in submissions {
-            if let text = submission.selftext {
-                try corpus.addUntokenizedDocument(text, in: transaction)
-            }
-        }
-
-        /// Save dictionary to disk
-        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
-        
-        try transaction.commit()
-    }
-    
-    func testQueryGuelphSubredditCorpus() async throws {
-        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-        
-        /// Setting up the environment
-        let env = try Environment()
-        try env.setMapSize(ONE_GB)
-        try env.setMaxReaders(DEFAULT_MAXREADERS)
-        try env.setMaxDBs(DEFAULT_MAXDBS)
-        try env.open(path: workingDirectoryPath)
-        
-        /// Reading the memory map (and dictionary) from disk
-        let transaction = try Transaction.begin(.read, in: env)
-        
-        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
-            encoder: documentEncoder,
-            namespace: "subreddit_durable",
-            in: transaction
-        )
-        
-        corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
-        
-        let query = "I love waterloo and I love the geese."
-        let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
-        
-        let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction)
-        
-        for result in result {
-            let key = Int(result.id.foreignKey)!
-            print(corpus.getUntokenizedDocument(at: key))
-        }
-    }
-}
-#endif
-
+//        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+//        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+//        workingDirectoryPath = FilePath(directoryURL.path)
+//        
+//        /// This commented-out code alternatively works in the Xcode bundle resource environment
+////        guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
+////        let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
+////        let fileManager = FileManager.default
+////        try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
+////        print("Resources directory: \(resourcesDirectoryURL)")
+////        workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
+//    }
+//    
+//    func testBuildBasicCorpus() throws {
+//        let docs = [
+//            "CNTK formerly known as Computational Network Toolkit",
+//            "is a free easy-to-use open-source commercial-grade toolkit",
+//            "that enable us to train deep learning algorithms to learn like the human brain."
+//        ]
+//        
+//        /// Setting up the environment
+//        let env = try Environment()
+//        try env.setMapSize(ONE_GB)
+//        try env.setMaxReaders(DEFAULT_MAXREADERS)
+//        try env.setMaxDBs(DEFAULT_MAXDBS)
+//        try env.open(path: workingDirectoryPath)
+//        
+//        /// Writing to LMDB
+//        let transaction = try Transaction.begin(.write, in: env)
+//
+//        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
+//            namespace: "testBasicExample",
+//            in: transaction
+//        )
+//        
+//        for doc in docs {
+//            try corpus.addUntokenizedDocument(doc, in: transaction)
+//        }
+//        
+//        try transaction.commit()
+//        
+//        /// Reading from LMDB
+//        let readTransaction = try Transaction.begin(.read, in: env)
+//        
+//        let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
+//            namespace: "testBasicExample",
+//            in: readTransaction
+//        )
+//        
+//        readTransaction.abort()
+//        
+//        // XCTAssert(readCorpus.count == 3)
+//        /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
+//        /// This is because size is only incremented when insertion is called but it is not called when read from disk!
+//    }
+//    
+//    func testQueryBasicCorpus() async throws {
+//        let docs = [
+//            "The quick brown fox jumps over the lazy dog",
+//            "I enjoy taking long walks along the beach at sunset",
+//            "Advances in neural networks have enabled new AI capabilities",
+//            "The stock market experienced a significant downturn last week",
+//            "Cooking a good meal can be both an art and a science",
+//            "The exploration of space is both challenging and rewarding",
+//            "Machine learning models are becoming increasingly sophisticated",
+//            "I love reading about history and ancient civilizations"
+//        ]
+//        
+//        let query = "I like to read about new technology and artificial intelligence"
+//        //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        
+//        /// Setting up the environment
+//        let env = try Environment()
+//        try env.setMapSize(ONE_GB)
+//        try env.setMaxReaders(DEFAULT_MAXREADERS)
+//        try env.setMaxDBs(DEFAULT_MAXDBS)
+//        try env.open(path: workingDirectoryPath)
+//        
+//        let transaction = try Transaction.begin(.write, in: env)
+//        
+//        /// Saving the memory map to disk
+//        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
+//            namespace: "testBasicQueryExample",
+//            in: transaction
+//        )
+//        
+//        for doc in docs {
+//            try corpus.addUntokenizedDocument(doc, in: transaction)
+//        }
+//        
+//        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
+//        
+//        try transaction.commit()
+//        
+//        do {
+//            let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }
+//            
+//            /// Reading the memory map (and dictionary) from disk
+//            let readTransaction = try Transaction.begin(.write, in: env)
+//            
+//            let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
+//                namespace: "testBasicQueryExample",
+//                in: readTransaction
+//            )
+//            
+//            readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?
+//            
+//            let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction)
+//            
+//            for result in result {
+//                let key = Int(result.id.foreignKey)!
+//                print(readCorpus.getUntokenizedDocument(at: key))
+//            }
+//        } catch {
+//            print("Error when trying corpus.encodedDocuments.find(): \(error)")
+//        }
+//        
+//        try transaction.commit()
+//    }
+//    
+//    func testBuildGuelphSubredditCorpus() async throws {
+//        /// Generates the LMDB durable storage to disk but runs no tests otherwise
+//    
+//        /// Setting up the environment
+//        let env = try Environment()
+//        try env.setMapSize(ONE_GB)
+//        try env.setMaxReaders(DEFAULT_MAXREADERS)
+//        try env.setMaxDBs(DEFAULT_MAXDBS)
+//        try env.open(path: workingDirectoryPath)
+//        
+//        /// Get subreddit data
+//        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
+//            fatalError("Failed to find waterloo_submissions.zst in test bundle.")
+//        }
+//        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
+//            fatalError("Failed to load waterloo_submissions.zst from test bundle.")
+//        }
+//
+//        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
+//        
+//        let transaction = try Transaction.begin(.write, in: env)
+//        
+//        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        
+//        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
+//            encoder: documentEncoder,
+//            namespace: "subreddit_durable",
+//            in: transaction
+//        )
+//
+//        /// Add documents to corpus
+//        for submission in submissions {
+//            if let text = submission.selftext {
+//                try corpus.addUntokenizedDocument(text, in: transaction)
+//            }
+//        }
+//
+//        /// Save dictionary to disk
+//        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
+//        
+//        try transaction.commit()
+//    }
+//    
+//    func testQueryGuelphSubredditCorpus() async throws {
+//        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
+//        
+//        /// Setting up the environment
+//        let env = try Environment()
+//        try env.setMapSize(ONE_GB)
+//        try env.setMaxReaders(DEFAULT_MAXREADERS)
+//        try env.setMaxDBs(DEFAULT_MAXDBS)
+//        try env.open(path: workingDirectoryPath)
+//        
+//        /// Reading the memory map (and dictionary) from disk
+//        let transaction = try Transaction.begin(.read, in: env)
+//        
+//        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
+//            encoder: documentEncoder,
+//            namespace: "subreddit_durable",
+//            in: transaction
+//        )
+//        
+//        corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
+//        
+//        let query = "I love waterloo and I love the geese."
+//        let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
+//        
+//        let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction)
+//        
+//        for result in result {
+//            let key = Int(result.id.foreignKey)!
+//            print(corpus.getUntokenizedDocument(at: key))
+//        }
+//    }
+//}
+//#endif
+//
diff --git a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift
index 026cfd9e..e1f236ea 100644
--- a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift
+++ b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift
@@ -29,7 +29,7 @@ final class BERT_test: XCTestCase {
             "I like to read about new technology and artificial intelligence"
         ]
         
-        for model in ["gte-small", "all_MiniLM_L6_v2"] {
+        for model in ["all_MiniLM_L6_v2"] {
             var database_embedding: [[Float]] = []
             var query_embedding: [Float] = []
             var embedding_dim: Int = 384
-- 
GitLab