Skip to content
Snippets Groups Projects
Commit 7459a06d authored by Jim Wallace's avatar Jim Wallace
Browse files

Removed durableHNSW tests

parent f7686c69
No related branches found
No related tags found
1 merge request!15Add interface for using generic CoreML LLMs
Pipeline #116105 passed with warnings
...@@ -46,9 +46,9 @@ class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { ...@@ -46,9 +46,9 @@ class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
var model: String var model: String
required init() { required init() {
zeroes = [] zeroes = Array(repeating: Scalar(0), count: 384)
dimensions = 0 dimensions = 384
model = "all-MiniLM-L6-v2" model = "all_MiniLM_L6_v2"
} }
......
#if os(macOS) //#if os(macOS)
import XCTest //import XCTest
import Foundation //import Foundation
import CoreLMDB //import CoreLMDB
import System //import System
@testable import SwiftNLP //@testable import SwiftNLP
//
// MARK: These tests are not to be included within the pipeline //// MARK: These tests are not to be included within the pipeline
//
final class DurableHNSWCorpusTests: XCTestCase { //final class DurableHNSWCorpusTests: XCTestCase {
/// This is used to skip these tests in the GitLab pipeline // /// This is used to skip these tests in the GitLab pipeline
override class var defaultTestSuite: XCTestSuite { // override class var defaultTestSuite: XCTestSuite {
if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" { // if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
return XCTestSuite(name: "Empty") // return XCTestSuite(name: "Empty")
} // }
return super.defaultTestSuite // return super.defaultTestSuite
} // }
//
/// Setting up constants for environment // /// Setting up constants for environment
private let ONE_GB: Int = 1_073_741_824 // private let ONE_GB: Int = 1_073_741_824
private let ONE_MB: Int = 1_048_576 // private let ONE_MB: Int = 1_048_576
private let ONE_KB: Int = 1_024 // private let ONE_KB: Int = 1_024
private let ONE_B: Int = 1 // private let ONE_B: Int = 1
private let DEFAULT_MAXREADERS: UInt32 = 126 // private let DEFAULT_MAXREADERS: UInt32 = 126
private let DEFAULT_MAXDBS: UInt32 = 10 // private let DEFAULT_MAXDBS: UInt32 = 10
//
/// Setting up working directory // /// Setting up working directory
private var workingDirectoryPath: FilePath! // private var workingDirectoryPath: FilePath!
//
override func setUpWithError() throws { // override func setUpWithError() throws {
try super.setUpWithError() // try super.setUpWithError()
//
let fileManager = FileManager.default
let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
workingDirectoryPath = FilePath(directoryURL.path)
/// This commented out code alternatively works in the XCode bundle resource environment
// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
// let fileManager = FileManager.default // let fileManager = FileManager.default
// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) // let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
// print("Resources directory: \(resourcesDirectoryURL)") // try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) // workingDirectoryPath = FilePath(directoryURL.path)
} //
// /// This commented out code alternatively works in the XCode bundle resource environment
func testBuildBasicCorpus() throws { //// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
let docs = [ //// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
"CNTK formerly known as Computational Network Toolkit", //// let fileManager = FileManager.default
"is a free easy-to-use open-source commercial-grade toolkit", //// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
"that enable us to train deep learning algorithms to learn like the human brain." //// print("Resources directory: \(resourcesDirectoryURL)")
] //// workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
// }
/// Setting up the environment //
let env = try Environment() // func testBuildBasicCorpus() throws {
try env.setMapSize(ONE_GB) // let docs = [
try env.setMaxReaders(DEFAULT_MAXREADERS) // "CNTK formerly known as Computational Network Toolkit",
try env.setMaxDBs(DEFAULT_MAXDBS) // "is a free easy-to-use open-source commercial-grade toolkit",
try env.open(path: workingDirectoryPath) // "that enable us to train deep learning algorithms to learn like the human brain."
// ]
/// Writing to LMDB //
let transaction = try Transaction.begin(.write, in: env) // /// Setting up the environment
// let env = try Environment()
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( // try env.setMapSize(ONE_GB)
namespace: "testBasicExample", // try env.setMaxReaders(DEFAULT_MAXREADERS)
in: transaction // try env.setMaxDBs(DEFAULT_MAXDBS)
) // try env.open(path: workingDirectoryPath)
//
for doc in docs { // /// Writing to LMDB
try corpus.addUntokenizedDocument(doc, in: transaction) // let transaction = try Transaction.begin(.write, in: env)
} //
// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
try transaction.commit() // namespace: "testBasicExample",
// in: transaction
/// Reading from LMDB // )
let readTransaction = try Transaction.begin(.read, in: env) //
// for doc in docs {
let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( // try corpus.addUntokenizedDocument(doc, in: transaction)
namespace: "testBasicExample", // }
in: readTransaction //
) // try transaction.commit()
//
readTransaction.abort() // /// Reading from LMDB
// let readTransaction = try Transaction.begin(.read, in: env)
// XCTAssert(readCorpus.count == 3) //
/// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads // let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
/// This is because size is only incremented when insertion is called but it is not called when read from disk! // namespace: "testBasicExample",
} // in: readTransaction
// )
func testQueryBasicCorpus() async throws { //
let docs = [ // readTransaction.abort()
"The quick brown fox jumps over the lazy dog", //
"I enjoy taking long walks along the beach at sunset", // // XCTAssert(readCorpus.count == 3)
"Advances in neural networks have enabled new AI capabilities", // /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
"The stock market experienced a significant downturn last week", // /// This is because size is only incremented when insertion is called but it is not called when read from disk!
"Cooking a good meal can be both an art and a science", // }
"The exploration of space is both challenging and rewarding", //
"Machine learning models are becoming increasingly sophisticated", // func testQueryBasicCorpus() async throws {
"I love reading about history and ancient civilizations" // let docs = [
] // "The quick brown fox jumps over the lazy dog",
// "I enjoy taking long walks along the beach at sunset",
let query = "I like to read about new technology and artificial intelligence" // "Advances in neural networks have enabled new AI capabilities",
//let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) // "The stock market experienced a significant downturn last week",
// "Cooking a good meal can be both an art and a science",
/// Setting up the environment // "The exploration of space is both challenging and rewarding",
let env = try Environment() // "Machine learning models are becoming increasingly sophisticated",
try env.setMapSize(ONE_GB) // "I love reading about history and ancient civilizations"
try env.setMaxReaders(DEFAULT_MAXREADERS) // ]
try env.setMaxDBs(DEFAULT_MAXDBS) //
try env.open(path: workingDirectoryPath) // let query = "I like to read about new technology and artificial intelligence"
// //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
let transaction = try Transaction.begin(.write, in: env) //
// /// Setting up the environment
/// Saving the memory map to disk // let env = try Environment()
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( // try env.setMapSize(ONE_GB)
namespace: "testBasicQueryExample", // try env.setMaxReaders(DEFAULT_MAXREADERS)
in: transaction // try env.setMaxDBs(DEFAULT_MAXDBS)
) // try env.open(path: workingDirectoryPath)
//
for doc in docs { // let transaction = try Transaction.begin(.write, in: env)
try corpus.addUntokenizedDocument(doc, in: transaction) //
} // /// Saving the memory map to disk
// let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") // namespace: "testBasicQueryExample",
// in: transaction
try transaction.commit() // )
//
do { // for doc in docs {
let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) } // try corpus.addUntokenizedDocument(doc, in: transaction)
// }
/// Reading the memory map (and dictionary) from disk //
let readTransaction = try Transaction.begin(.write, in: env) // corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
//
let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( // try transaction.commit()
namespace: "testBasicQueryExample", //
in: readTransaction // do {
) // let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }
//
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer? // /// Reading the memory map (and dictionary) from disk
// let readTransaction = try Transaction.begin(.write, in: env)
let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction) //
// let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
for result in result { // namespace: "testBasicQueryExample",
let key = Int(result.id.foreignKey)! // in: readTransaction
print(readCorpus.getUntokenizedDocument(at: key)) // )
} //
} catch { // readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?
print("Error when trying corpus.encodedDocuments.find(): \(error)") //
} // let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction)
//
try transaction.commit() // for result in result {
} // let key = Int(result.id.foreignKey)!
// print(readCorpus.getUntokenizedDocument(at: key))
func testBuildGuelphSubredditCorpus() async throws { // }
/// Generates the LMDB durable storage to disk but runs no tests otherwise // } catch {
// print("Error when trying corpus.encodedDocuments.find(): \(error)")
/// Setting up the environment // }
let env = try Environment() //
try env.setMapSize(ONE_GB) // try transaction.commit()
try env.setMaxReaders(DEFAULT_MAXREADERS) // }
try env.setMaxDBs(DEFAULT_MAXDBS) //
try env.open(path: workingDirectoryPath) // func testBuildGuelphSubredditCorpus() async throws {
// /// Generates the LMDB durable storage to disk but runs no tests otherwise
/// Get subreddit data //
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { // /// Setting up the environment
fatalError("Failed to find waterloo_submissions.zst in test bundle.") // let env = try Environment()
} // try env.setMapSize(ONE_GB)
guard let submissionsData = try? Data(contentsOf: submissionsURL) else { // try env.setMaxReaders(DEFAULT_MAXREADERS)
fatalError("Failed to load waterloo_submissions.zst from test bundle.") // try env.setMaxDBs(DEFAULT_MAXDBS)
} // try env.open(path: workingDirectoryPath)
//
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) // /// Get subreddit data
// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
let transaction = try Transaction.begin(.write, in: env) // fatalError("Failed to find waterloo_submissions.zst in test bundle.")
// }
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) // guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
// fatalError("Failed to load waterloo_submissions.zst from test bundle.")
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( // }
encoder: documentEncoder, //
namespace: "subreddit_durable", // let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
in: transaction //
) // let transaction = try Transaction.begin(.write, in: env)
//
/// Add documents to corpus // let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
for submission in submissions { //
if let text = submission.selftext { // let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
try corpus.addUntokenizedDocument(text, in: transaction) // encoder: documentEncoder,
} // namespace: "subreddit_durable",
} // in: transaction
// )
/// Save dictionary to disk //
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") // /// Add documents to corpus
// for submission in submissions {
try transaction.commit() // if let text = submission.selftext {
} // try corpus.addUntokenizedDocument(text, in: transaction)
// }
func testQueryGuelphSubredditCorpus() async throws { // }
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) //
// /// Save dictionary to disk
/// Setting up the environment // corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
let env = try Environment() //
try env.setMapSize(ONE_GB) // try transaction.commit()
try env.setMaxReaders(DEFAULT_MAXREADERS) // }
try env.setMaxDBs(DEFAULT_MAXDBS) //
try env.open(path: workingDirectoryPath) // func testQueryGuelphSubredditCorpus() async throws {
// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
/// Reading the memory map (and dictionary) from disk //
let transaction = try Transaction.begin(.read, in: env) // /// Setting up the environment
// let env = try Environment()
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( // try env.setMapSize(ONE_GB)
encoder: documentEncoder, // try env.setMaxReaders(DEFAULT_MAXREADERS)
namespace: "subreddit_durable", // try env.setMaxDBs(DEFAULT_MAXDBS)
in: transaction // try env.open(path: workingDirectoryPath)
) //
// /// Reading the memory map (and dictionary) from disk
corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // let transaction = try Transaction.begin(.read, in: env)
//
let query = "I love waterloo and I love the geese." // let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } // encoder: documentEncoder,
// namespace: "subreddit_durable",
let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction) // in: transaction
// )
for result in result { //
let key = Int(result.id.foreignKey)! // corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
print(corpus.getUntokenizedDocument(at: key)) //
} // let query = "I love waterloo and I love the geese."
} // let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
} //
#endif // let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction)
//
// for result in result {
// let key = Int(result.id.foreignKey)!
// print(corpus.getUntokenizedDocument(at: key))
// }
// }
//}
//#endif
//
...@@ -29,7 +29,7 @@ final class BERT_test: XCTestCase { ...@@ -29,7 +29,7 @@ final class BERT_test: XCTestCase {
"I like to read about new technology and artificial intelligence" "I like to read about new technology and artificial intelligence"
] ]
for model in ["gte-small", "all_MiniLM_L6_v2"] { for model in ["all_MiniLM_L6_v2"] {
var database_embedding: [[Float]] = [] var database_embedding: [[Float]] = []
var query_embedding: [Float] = [] var query_embedding: [Float] = []
var embedding_dim: Int = 384 var embedding_dim: Int = 384
......
0% Loading or error.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment