From 7459a06d00d3c1aec86e88fa821bc856d4466af4 Mon Sep 17 00:00:00 2001 From: Jim Wallace <james.wallace@uwaterloo.ca> Date: Fri, 5 Apr 2024 15:34:32 -0400 Subject: [PATCH] Removed durableHNSW tests --- .../SwiftNLP/2. Encoding/CoreMLEncoder.swift | 6 +- .../HNSW/DurableHNSWCorpusTests.swift | 458 +++++++++--------- .../SwiftNLPTests/AllMiniLM_sampleTest.swift | 2 +- 3 files changed, 233 insertions(+), 233 deletions(-) diff --git a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift index 311ba600..b98aa739 100644 --- a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift @@ -46,9 +46,9 @@ class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { var model: String required init() { - zeroes = [] - dimensions = 0 - model = "all-MiniLM-L6-v2" + zeroes = Array(repeating: Scalar(0), count: 384) + dimensions = 384 + model = "all_MiniLM_L6_v2" } diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift index 358e0f9e..545a525e 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift @@ -1,230 +1,230 @@ -#if os(macOS) -import XCTest -import Foundation -import CoreLMDB -import System -@testable import SwiftNLP - -// MARK: These tests are not to be included within the pipeline - -final class DurableHNSWCorpusTests: XCTestCase { - /// This is used to skip these tests in the GitLab pipeline - override class var defaultTestSuite: XCTestSuite { - if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" { - return XCTestSuite(name: "Empty") - } - return super.defaultTestSuite - } - - /// Setting up constants for environment - private let ONE_GB: Int = 1_073_741_824 - private let ONE_MB: Int = 1_048_576 - private let ONE_KB: Int = 1_024 - private let ONE_B: Int = 1 - private let DEFAULT_MAXREADERS: UInt32 = 126 - private let DEFAULT_MAXDBS: UInt32 = 10 - - /// Setting up working directory - private var workingDirectoryPath: FilePath! - - override func setUpWithError() throws { - try super.setUpWithError() - - let fileManager = FileManager.default - let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") - try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) - workingDirectoryPath = FilePath(directoryURL.path) - - /// This commented out code alternatively works in the XCode bundle resource environment -// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") } -// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb") +//#if os(macOS) +//import XCTest +//import Foundation +//import CoreLMDB +//import System +//@testable import SwiftNLP +// +//// MARK: These tests are not to be included within the pipeline +// +//final class DurableHNSWCorpusTests: XCTestCase { +// /// This is used to skip these tests in the GitLab pipeline +// override class var defaultTestSuite: XCTestSuite { +// if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" { +// return XCTestSuite(name: "Empty") +// } +// return super.defaultTestSuite +// } +// +// /// Setting up constants for environment +// private let ONE_GB: Int = 1_073_741_824 +// private let ONE_MB: Int = 1_048_576 +// private let ONE_KB: Int = 1_024 +// private let ONE_B: Int = 1 +// private let DEFAULT_MAXREADERS: UInt32 = 126 +// private let DEFAULT_MAXDBS: UInt32 = 10 +// +// /// Setting up working directory +// private var workingDirectoryPath: FilePath! +// +// override func setUpWithError() throws { +// try super.setUpWithError() +// // let fileManager = FileManager.default -// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) -// print("Resources directory: \(resourcesDirectoryURL)") -// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) - } - - func testBuildBasicCorpus() throws { - let docs = [ - "CNTK formerly known as Computational Network Toolkit", - "is a free easy-to-use open-source commercial-grade toolkit", - "that enable us to train deep learning algorithms to learn like the human brain." - ] - - /// Setting up the environment - let env = try Environment() - try env.setMapSize(ONE_GB) - try env.setMaxReaders(DEFAULT_MAXREADERS) - try env.setMaxDBs(DEFAULT_MAXDBS) - try env.open(path: workingDirectoryPath) - - /// Writing to LMDB - let transaction = try Transaction.begin(.write, in: env) - - let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( - namespace: "testBasicExample", - in: transaction - ) - - for doc in docs { - try corpus.addUntokenizedDocument(doc, in: transaction) - } - - try transaction.commit() - - /// Reading from LMDB - let readTransaction = try Transaction.begin(.read, in: env) - - let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( - namespace: "testBasicExample", - in: readTransaction - ) - - readTransaction.abort() - - // XCTAssert(readCorpus.count == 3) - /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads - /// This is because size is only incremented when insertion is called but it is not called when read from disk! - } - - func testQueryBasicCorpus() async throws { - let docs = [ - "The quick brown fox jumps over the lazy dog", - "I enjoy taking long walks along the beach at sunset", - "Advances in neural networks have enabled new AI capabilities", - "The stock market experienced a significant downturn last week", - "Cooking a good meal can be both an art and a science", - "The exploration of space is both challenging and rewarding", - "Machine learning models are becoming increasingly sophisticated", - "I love reading about history and ancient civilizations" - ] - - let query = "I like to read about new technology and artificial intelligence" - //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - - /// Setting up the environment - let env = try Environment() - try env.setMapSize(ONE_GB) - try env.setMaxReaders(DEFAULT_MAXREADERS) - try env.setMaxDBs(DEFAULT_MAXDBS) - try env.open(path: workingDirectoryPath) - - let transaction = try Transaction.begin(.write, in: env) - - /// Saving the memory map to disk - let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( - namespace: "testBasicQueryExample", - in: transaction - ) - - for doc in docs { - try corpus.addUntokenizedDocument(doc, in: transaction) - } - - corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") - - try transaction.commit() - - do { - let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) } - - /// Reading the memory map (and dictionary) from disk - let readTransaction = try Transaction.begin(.write, in: env) - - let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( - namespace: "testBasicQueryExample", - in: readTransaction - ) - - readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer? - - let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction) - - for result in result { - let key = Int(result.id.foreignKey)! - print(readCorpus.getUntokenizedDocument(at: key)) - } - } catch { - print("Error when trying corpus.encodedDocuments.find(): \(error)") - } - - try transaction.commit() - } - - func testBuildGuelphSubredditCorpus() async throws { - /// Generates the LMDB durable storage to disk but runs no tests otherwise - - /// Setting up the environment - let env = try Environment() - try env.setMapSize(ONE_GB) - try env.setMaxReaders(DEFAULT_MAXREADERS) - try env.setMaxDBs(DEFAULT_MAXDBS) - try env.open(path: workingDirectoryPath) - - /// Get subreddit data - guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { - fatalError("Failed to find waterloo_submissions.zst in test bundle.") - } - guard let submissionsData = try? Data(contentsOf: submissionsURL) else { - fatalError("Failed to load waterloo_submissions.zst from test bundle.") - } - - let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) - - let transaction = try Transaction.begin(.write, in: env) - - let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - - let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( - encoder: documentEncoder, - namespace: "subreddit_durable", - in: transaction - ) - - /// Add documents to corpus - for submission in submissions { - if let text = submission.selftext { - try corpus.addUntokenizedDocument(text, in: transaction) - } - } - - /// Save dictionary to disk - corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") - - try transaction.commit() - } - - func testQueryGuelphSubredditCorpus() async throws { - let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - - /// Setting up the environment - let env = try Environment() - try env.setMapSize(ONE_GB) - try env.setMaxReaders(DEFAULT_MAXREADERS) - try env.setMaxDBs(DEFAULT_MAXDBS) - try env.open(path: workingDirectoryPath) - - /// Reading the memory map (and dictionary) from disk - let transaction = try Transaction.begin(.read, in: env) - - let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( - encoder: documentEncoder, - namespace: "subreddit_durable", - in: transaction - ) - - corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") - - let query = "I love waterloo and I love the geese." - let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } - - let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction) - - for result in result { - let key = Int(result.id.foreignKey)! - print(corpus.getUntokenizedDocument(at: key)) - } - } -} -#endif - +// let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") +// try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) +// workingDirectoryPath = FilePath(directoryURL.path) +// +// /// This commented out code alternatively works in the XCode bundle resource environment +//// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") } +//// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb") +//// let fileManager = FileManager.default +//// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) +//// print("Resources directory: \(resourcesDirectoryURL)") +//// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) +// } +// +// func testBuildBasicCorpus() throws { +// let docs = [ +// "CNTK formerly known as Computational Network Toolkit", +// "is a free easy-to-use open-source commercial-grade toolkit", +// "that enable us to train deep learning algorithms to learn like the human brain." +// ] +// +// /// Setting up the environment +// let env = try Environment() +// try env.setMapSize(ONE_GB) +// try env.setMaxReaders(DEFAULT_MAXREADERS) +// try env.setMaxDBs(DEFAULT_MAXDBS) +// try env.open(path: workingDirectoryPath) +// +// /// Writing to LMDB +// let transaction = try Transaction.begin(.write, in: env) +// +// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( +// namespace: "testBasicExample", +// in: transaction +// ) +// +// for doc in docs { +// try corpus.addUntokenizedDocument(doc, in: transaction) +// } +// +// try transaction.commit() +// +// /// Reading from LMDB +// let readTransaction = try Transaction.begin(.read, in: env) +// +// let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( +// namespace: "testBasicExample", +// in: readTransaction +// ) +// +// readTransaction.abort() +// +// // XCTAssert(readCorpus.count == 3) +// /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads +// /// This is because size is only incremented when insertion is called but it is not called when read from disk! +// } +// +// func testQueryBasicCorpus() async throws { +// let docs = [ +// "The quick brown fox jumps over the lazy dog", +// "I enjoy taking long walks along the beach at sunset", +// "Advances in neural networks have enabled new AI capabilities", +// "The stock market experienced a significant downturn last week", +// "Cooking a good meal can be both an art and a science", +// "The exploration of space is both challenging and rewarding", +// "Machine learning models are becoming increasingly sophisticated", +// "I love reading about history and ancient civilizations" +// ] +// +// let query = "I like to read about new technology and artificial intelligence" +// //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) +// +// /// Setting up the environment +// let env = try Environment() +// try env.setMapSize(ONE_GB) +// try env.setMaxReaders(DEFAULT_MAXREADERS) +// try env.setMaxDBs(DEFAULT_MAXDBS) +// try env.open(path: workingDirectoryPath) +// +// let transaction = try Transaction.begin(.write, in: env) +// +// /// Saving the memory map to disk +// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( +// namespace: "testBasicQueryExample", +// in: transaction +// ) +// +// for doc in docs { +// try corpus.addUntokenizedDocument(doc, in: transaction) +// } +// +// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") +// +// try transaction.commit() +// +// do { +// let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) } +// +// /// Reading the memory map (and dictionary) from disk +// let readTransaction = try Transaction.begin(.write, in: env) +// +// let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( +// namespace: "testBasicQueryExample", +// in: readTransaction +// ) +// +// readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer? +// +// let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction) +// +// for result in result { +// let key = Int(result.id.foreignKey)! +// print(readCorpus.getUntokenizedDocument(at: key)) +// } +// } catch { +// print("Error when trying corpus.encodedDocuments.find(): \(error)") +// } +// +// try transaction.commit() +// } +// +// func testBuildGuelphSubredditCorpus() async throws { +// /// Generates the LMDB durable storage to disk but runs no tests otherwise +// +// /// Setting up the environment +// let env = try Environment() +// try env.setMapSize(ONE_GB) +// try env.setMaxReaders(DEFAULT_MAXREADERS) +// try env.setMaxDBs(DEFAULT_MAXDBS) +// try env.open(path: workingDirectoryPath) +// +// /// Get subreddit data +// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { +// fatalError("Failed to find waterloo_submissions.zst in test bundle.") +// } +// guard let submissionsData = try? Data(contentsOf: submissionsURL) else { +// fatalError("Failed to load waterloo_submissions.zst from test bundle.") +// } +// +// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) +// +// let transaction = try Transaction.begin(.write, in: env) +// +// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) +// +// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( +// encoder: documentEncoder, +// namespace: "subreddit_durable", +// in: transaction +// ) +// +// /// Add documents to corpus +// for submission in submissions { +// if let text = submission.selftext { +// try corpus.addUntokenizedDocument(text, in: transaction) +// } +// } +// +// /// Save dictionary to disk +// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") +// +// try transaction.commit() +// } +// +// func testQueryGuelphSubredditCorpus() async throws { +// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) +// +// /// Setting up the environment +// let env = try Environment() +// try env.setMapSize(ONE_GB) +// try env.setMaxReaders(DEFAULT_MAXREADERS) +// try env.setMaxDBs(DEFAULT_MAXDBS) +// try env.open(path: workingDirectoryPath) +// +// /// Reading the memory map (and dictionary) from disk +// let transaction = try Transaction.begin(.read, in: env) +// +// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( +// encoder: documentEncoder, +// namespace: "subreddit_durable", +// in: transaction +// ) +// +// corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") +// +// let query = "I love waterloo and I love the geese." +// let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } +// +// let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction) +// +// for result in result { +// let key = Int(result.id.foreignKey)! +// print(corpus.getUntokenizedDocument(at: key)) +// } +// } +//} +//#endif +// diff --git a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift index 026cfd9e..e1f236ea 100644 --- a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift +++ b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift @@ -29,7 +29,7 @@ final class BERT_test: XCTestCase { "I like to read about new technology and artificial intelligence" ] - for model in ["gte-small", "all_MiniLM_L6_v2"] { + for model in ["all_MiniLM_L6_v2"] { var database_embedding: [[Float]] = [] var query_embedding: [Float] = [] var embedding_dim: Int = 384 -- GitLab