diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift index e37154b4bbafa6fbf835f56920f7a7f0c66d7256..78c77324a197d25f694ebc78ee8fda17c5cf7f2f 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift @@ -32,9 +32,9 @@ extension DurableHNSWCorpus { /// This extension is used for the dictionary operations public struct DocumentVectorPair { var untokenizedDocument: String - var vector: [Scalar] + var vector: [Encoder.Scalar] - init(untokenizedDocument: String, vector: [Scalar]) { + init(untokenizedDocument: String, vector: [Encoder.Scalar]) { self.untokenizedDocument = untokenizedDocument self.vector = vector } @@ -50,7 +50,7 @@ extension DurableHNSWCorpus { } @inlinable - func getVector(at key: Int) -> [Scalar] { + func getVector(at key: Int) -> [Encoder.Scalar] { if let pair = dictionary[key] { return pair.vector } else { @@ -63,7 +63,7 @@ extension DurableHNSWCorpus { return dictionary } - func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) { + func addDocumentVectorPair(at key: Int, document: String, vector: [Encoder.Scalar]) { dictionary[key] = DocumentVectorPair( untokenizedDocument: document, vector: vector diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + File IO.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + File IO.swift index bb3c73bd4f65cedd3dccaece48918e9ce0605aa8..ec7c64d389ae089954f397787d68e82722b521c6 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + File IO.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + File IO.swift @@ -131,12 +131,12 @@ extension DurableHNSWCorpus { let vectorLength = vectorLengthData.withUnsafeBytes { $0.load(as: Int.self) } index += MemoryLayout<Int>.size - var vector = [Scalar]() + var vector = [Encoder.Scalar]() for _ in 0..<vectorLength { - let scalarData = data.subdata(in: index..<index+MemoryLayout<Scalar>.size) - let scalar = scalarData.withUnsafeBytes { $0.load(as: Scalar.self) } + let scalarData = data.subdata(in: index..<index+MemoryLayout<Encoder.Scalar>.size) + let scalar = scalarData.withUnsafeBytes { $0.load(as: Encoder.Scalar.self) } vector.append(scalar) - index += MemoryLayout<Scalar>.size + index += MemoryLayout<Encoder.Scalar>.size } // Add the key-value pair to the dictionary diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift index 60cba61e65696d479268a9153fbbe54f541e8764..072b2b1d32ab05b38819f8905a9901cd2ac399d6 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift @@ -1,69 +1,69 @@ -//// Copyright (c) 2024 Jim Wallace -//// -//// Permission is hereby granted, free of charge, to any person -//// obtaining a copy of this software and associated documentation -//// files (the "Software"), to deal in the Software without -//// restriction, including without limitation the rights to use, -//// copy, modify, merge, publish, distribute, sublicense, and/or sell -//// copies of the Software, and to permit persons to whom the -//// Software is furnished to do so, subject to the following -//// conditions: -//// -//// The above copyright notice and this permission notice shall be -//// included in all copies or substantial portions of the Software. -//// -//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -//// OTHER DEALINGS IN THE SOFTWARE. -//// -//// Created by Mingchung Xia on 2024-03-16. -//// +// Copyright (c) 2024 Jim Wallace // -//#if os(macOS) +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: // -//import Foundation +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. // -///// HNSWCorpus iterates through its dictionary of key to document vector pairs -// -//extension DurableHNSWCorpus: Sequence, Collection { -// // Sequence Protocol Requirements -// @inlinable -// func makeIterator() -> AnyIterator<DocumentVectorPair> { -// var iterator = dictionary.values.makeIterator() -// return AnyIterator { -// return iterator.next() -// } -// } -// -// // Collection Protocol Requirements -// @inlinable -// var startIndex: Int { -// return dictionary.keys.sorted().startIndex -// } -// -// @inlinable -// var endIndex: Int { -// return dictionary.keys.sorted().endIndex -// } -// -// @inlinable -// subscript(position: Int) -> DocumentVectorPair { -// let key = dictionary.keys.sorted()[position] -// guard let pair = dictionary[key] else { -// fatalError("Key \(key) not found in HNSW dictionary") -// } -// return pair -// } -// -// @inlinable -// func index(after i: Int) -> Int { -// return dictionary.keys.sorted().index(after: i) -// } -//} +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. // -//#endif +// Created by Mingchung Xia on 2024-03-16. +// + +#if os(macOS) + +import Foundation + +/// HNSWCorpus iterates through its dictionary of key to document vector pairs + +extension DurableHNSWCorpus: Sequence, Collection { + // Sequence Protocol Requirements + @inlinable + func makeIterator() -> AnyIterator<DocumentVectorPair> { + var iterator = dictionary.values.makeIterator() + return AnyIterator { + return iterator.next() + } + } + + // Collection Protocol Requirements + @inlinable + var startIndex: Int { + return dictionary.keys.sorted().startIndex + } + + @inlinable + var endIndex: Int { + return dictionary.keys.sorted().endIndex + } + + @inlinable + subscript(position: Int) -> DocumentVectorPair { + let key = dictionary.keys.sorted()[position] + guard let pair = dictionary[key] else { + fatalError("Key \(key) not found in HNSW dictionary") + } + return pair + } + + @inlinable + func index(after i: Int) -> Int { + return dictionary.keys.sorted().index(after: i) + } +} + +#endif diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift index 8cee16b50dfe33738910955e001eea49949023c9..00f49fdcf861e5debe31b564860376781fe3da5f 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift @@ -34,50 +34,87 @@ import CoreLMDBCoders // MARK: DurableHNSWCorpus cannot conform to SNLPCorpus under its current definition // This is because addingUntokenizedDocuments in a DurableHNSWCorpus requires an additional parameter (transaction) and can throw -final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemoryLayoutStorableFloat> { +final class DurableHNSWCorpus<Item: SNLPDataItem, Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable & UnsafeMemoryLayoutStorableFloat { + + + public typealias HNSWDictionary = [Int: DocumentVectorPair] - internal var documentEncoder: any SNLPEncoder<Scalar> - var zeroes: [Scalar] { documentEncoder.zeroes } - var encodedDocuments: DeterministicDurableVectorIndex<Scalar> - var count: Int { encodedDocuments.size } + internal var documentEncoder: Encoder + internal var documents = ContiguousArray<Item>() + internal var encodedDocuments = ContiguousArray<[Encoder.Scalar]>() + + var index: DeterministicDurableVectorIndex<Encoder.Scalar> + + + private let ONE_GB: Int = 1_073_741_824 + private let ONE_MB: Int = 1_048_576 + private let ONE_KB: Int = 1_024 + private let ONE_B: Int = 1 + private let DEFAULT_MAXREADERS: UInt32 = 126 + private let DEFAULT_MAXDBS: UInt32 = 10 + + + // Keeps track of the original document for client code var dictionary: HNSWDictionary = [:] + + // typicalNeighbourhoodSize = 20 is a standard benchmark - init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws { - documentEncoder = ContextFreeEncoder(source: encoding) - - encodedDocuments = try DeterministicDurableVectorIndex<Scalar>( - namespace: namespace, - typicalNeighborhoodSize: typicalNeighborhoodSize, - in: transaction - ) - } +// init(encoding: ContextFreeEncoder<Encoder.Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws { +// documentEncoder = ContextFreeEncoder<Encoder.Scalar>(source: encoding) as! Encoder +// +// index = try DeterministicDurableVectorIndex<Encoder.Scalar>( +// namespace: namespace, +// typicalNeighborhoodSize: typicalNeighborhoodSize, +// in: transaction +// ) +// } - init(encoder: any SNLPEncoder<Scalar>, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws { + init(encoder: Encoder = Encoder(), typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws { documentEncoder = encoder - encodedDocuments = try DeterministicDurableVectorIndex<Scalar>( + index = try DeterministicDurableVectorIndex<Encoder.Scalar>( namespace: namespace, typicalNeighborhoodSize: typicalNeighborhoodSize, in: transaction ) } + @inlinable - func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws { + func addUntokenizedDocument(_ document: Item, in transaction: Transaction) throws { /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder - /// encodedDocuments.insert will insert and return the corresponding key (id)s - let encodedVector = documentEncoder.encodeSentence(document) - let key = try encodedDocuments.insert(encodedVector, in: transaction) + /// encodedDocuments.insert will insert and return the corresponding key (id)s + + documents.append(document) + encodedDocuments.append(documentEncoder.encodeSentence(document.fullText)) + + assert( documents.count == encodedDocuments.count ) + + let encodedVector = documentEncoder.encodeSentence(document.fullText) + let key = try index.insert(encodedVector, in: transaction) addDocumentVectorPair( at: key, - document: document, + document: document.fullText, vector: encodedVector ) } + + func searchFor(_ query: String) -> [Item] { + return [] + } + +// func searchFor(_ query: String, in transaction: Transaction) -> [Item] { +// let queryVector = documentEncoder.encodeToken(query) +// let results = try! index.find(near: queryVector, limit: 8, in: transaction) +// +// return results.map{ documents[$0.id] } +// return [] +// } + } #endif diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift index aadeee14cddd13236028c923ef0de0f8aea57a82..706ddb7e9286ca254b52b4898502c45a5015fa72 100644 --- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift @@ -34,7 +34,7 @@ import Foundation // MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus typealias HNSWCorpus = EphemeralHNSWCorpus -final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable{ +final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable { public typealias HNSWDictionary = [Int: DocumentVectorPair] diff --git a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift index 541c9efbc72c745e009c8189d6bdc0d0d53fa730..96800d39dc9ba6d4fe00a8e16760e060e903a2f0 100644 --- a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift @@ -26,18 +26,19 @@ import Foundation import CoreML -//class CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder { -// -// var zeroes: [Scalar] -// -// func encodeToken(_ token: String) -> [Scalar] { -// fatalError("CoreMLEncoder not implemented yet. Get on it.") -// } -// -// func encodeSentence(_ sentence: String) -> [Scalar] { -// fatalError("CoreMLEncoder not implemented yet. Get on it.") -// } -//} +struct CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder { + + var zeroes: [Scalar] = [] + var dimensions: UInt = 0 + + func encodeToken(_ token: String) -> [Scalar] { + fatalError("CoreMLEncoder not implemented yet. Get on it.") + } + + func encodeSentence(_ sentence: String) -> [Scalar] { + fatalError("CoreMLEncoder not implemented yet. Get on it.") + } +} //@available(macOS 13.0, *) //public class MiniLMEmbeddings { diff --git a/Sources/SwiftNLP/2. Encoding/NatualLanguageEncoder.swift b/Sources/SwiftNLP/2. Encoding/NatualLanguageEncoder.swift index 08a3be651ee258883b68ce94500e1b9d5fa45db1..98d32eaa199fa2e89db5d8ea02747358b9fe1391 100644 --- a/Sources/SwiftNLP/2. Encoding/NatualLanguageEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/NatualLanguageEncoder.swift @@ -25,7 +25,7 @@ import Foundation import NaturalLanguage -struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { +struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder { var dimensions: UInt = 512 var zeroes: [Scalar] { Array(repeating: Scalar(0), count: Int(dimensions)) } diff --git a/Sources/SwiftNLP/2. Encoding/OpenAIEncoder.swift b/Sources/SwiftNLP/2. Encoding/OpenAIEncoder.swift index e2dd9b4621b55004267d6b3eca9a41e1a37c5816..ff3ca6ad146dc47031940f620ffe797c836adeda 100644 --- a/Sources/SwiftNLP/2. Encoding/OpenAIEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/OpenAIEncoder.swift @@ -23,18 +23,22 @@ import Foundation -//class OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder { -// -// -// var zeroes: [Scalar] -// -// func fetchEncodingForToken(_ token: String) async throws -> [Scalar] { -// fatalError("OpenAIEncoder not implemented. Get on it.") -// } -// -// func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] { -// fatalError("OpenAIEncoder not implemented. Get on it.") -// } -// -// -//} +struct OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder { + + var zeroes: [Scalar] + var dimensions: UInt + + init() { + fatalError() + } + + func fetchEncodingForToken(_ token: String) async throws -> [Scalar] { + fatalError("OpenAIEncoder not implemented. Get on it.") + } + + func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] { + fatalError("OpenAIEncoder not implemented. Get on it.") + } + + +} diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift index 579e136826530d550ebc735bdbe0f9c50e3ab189..358e0f9e8bd68fb1ff8ace55215d2a905bf8c503 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift @@ -1,234 +1,230 @@ -//#if os(macOS) -//import XCTest -//import Foundation -//import CoreLMDB -//import System -//@testable import SwiftNLP -// -//// MARK: These tests are not to be included within the pipeline -// -//final class DurableHNSWCorpusTests: XCTestCase { -// /// This is used to skip these tests in the GitLab pipeline -// override class var defaultTestSuite: XCTestSuite { -// if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" { -// return XCTestSuite(name: "Empty") -// } -// return super.defaultTestSuite -// } -// -// /// Setting up constants for environment -// private let ONE_GB: Int = 1_073_741_824 -// private let ONE_MB: Int = 1_048_576 -// private let ONE_KB: Int = 1_024 -// private let ONE_B: Int = 1 -// private let DEFAULT_MAXREADERS: UInt32 = 126 -// private let DEFAULT_MAXDBS: UInt32 = 10 -// -// /// Setting up working directory -// private var workingDirectoryPath: FilePath! -// -// override func setUpWithError() throws { -// try super.setUpWithError() -// +#if os(macOS) +import XCTest +import Foundation +import CoreLMDB +import System +@testable import SwiftNLP + +// MARK: These tests are not to be included within the pipeline + +final class DurableHNSWCorpusTests: XCTestCase { + /// This is used to skip these tests in the GitLab pipeline + override class var defaultTestSuite: XCTestSuite { + if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" { + return XCTestSuite(name: "Empty") + } + return super.defaultTestSuite + } + + /// Setting up constants for environment + private let ONE_GB: Int = 1_073_741_824 + private let ONE_MB: Int = 1_048_576 + private let ONE_KB: Int = 1_024 + private let ONE_B: Int = 1 + private let DEFAULT_MAXREADERS: UInt32 = 126 + private let DEFAULT_MAXDBS: UInt32 = 10 + + /// Setting up working directory + private var workingDirectoryPath: FilePath! + + override func setUpWithError() throws { + try super.setUpWithError() + + let fileManager = FileManager.default + let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") + try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) + workingDirectoryPath = FilePath(directoryURL.path) + + /// This commented out code alternatively works in the XCode bundle resource environment +// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") } +// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb") // let fileManager = FileManager.default -// let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") -// try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) -// workingDirectoryPath = FilePath(directoryURL.path) -// -// /// This commented out code alternatively works in the XCode bundle resource environment -//// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") } -//// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb") -//// let fileManager = FileManager.default -//// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) -//// print("Resources directory: \(resourcesDirectoryURL)") -//// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) -// } -// -// func testBuildBasicCorpus() throws { -// let docs = [ -// "CNTK formerly known as Computational Network Toolkit", -// "is a free easy-to-use open-source commercial-grade toolkit", -// "that enable us to train deep learning algorithms to learn like the human brain." -// ] -// -// /// Setting up the environment -// let env = try Environment() -// try env.setMapSize(ONE_GB) -// try env.setMaxReaders(DEFAULT_MAXREADERS) -// try env.setMaxDBs(DEFAULT_MAXDBS) -// try env.open(path: workingDirectoryPath) -// -// /// Writing to LMDB -// let transaction = try Transaction.begin(.write, in: env) -// -// let corpus = try DurableHNSWCorpus( -// encoding: .glove6B50d, -// namespace: "testBasicExample", -// in: transaction -// ) -// -// for doc in docs { -// try corpus.addUntokenizedDocument(doc, in: transaction) -// } -// -// try transaction.commit() -// -// /// Reading from LMDB -// let readTransaction = try Transaction.begin(.read, in: env) -// -// let _ = try DurableHNSWCorpus( -// encoding: .glove6B50d, -// namespace: "testBasicExample", -// in: readTransaction -// ) -// -// readTransaction.abort() -// -// // XCTAssert(readCorpus.count == 3) -// /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads -// /// This is because size is only incremented when insertion is called but it is not called when read from disk! -// } -// -// func testQueryBasicCorpus() async throws { -// let docs = [ -// "The quick brown fox jumps over the lazy dog", -// "I enjoy taking long walks along the beach at sunset", -// "Advances in neural networks have enabled new AI capabilities", -// "The stock market experienced a significant downturn last week", -// "Cooking a good meal can be both an art and a science", -// "The exploration of space is both challenging and rewarding", -// "Machine learning models are becoming increasingly sophisticated", -// "I love reading about history and ancient civilizations" -// ] -// -// let query = "I like to read about new technology and artificial intelligence" -// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) -// -// /// Setting up the environment -// let env = try Environment() -// try env.setMapSize(ONE_GB) -// try env.setMaxReaders(DEFAULT_MAXREADERS) -// try env.setMaxDBs(DEFAULT_MAXDBS) -// try env.open(path: workingDirectoryPath) -// -// let transaction = try Transaction.begin(.write, in: env) -// -// /// Saving the memory map to disk -// let corpus = try DurableHNSWCorpus( -// encoder: documentEncoder, -// namespace: "testBasicQueryExample", -// in: transaction -// ) -// -// for doc in docs { -// try corpus.addUntokenizedDocument(doc, in: transaction) -// } -// -// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") -// -// try transaction.commit() -// -// do { -// let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } -// -// /// Reading the memory map (and dictionary) from disk -// let readTransaction = try Transaction.begin(.write, in: env) -// -// let readCorpus = try DurableHNSWCorpus( -// encoder: documentEncoder, -// namespace: "testBasicQueryExample", -// in: readTransaction -// ) -// -// readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer? -// -// let result = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction) -// -// for result in result { -// let key = Int(result.id.foreignKey)! -// print(readCorpus.getUntokenizedDocument(at: key)) -// } -// } catch { -// print("Error when trying corpus.encodedDocuments.find(): \(error)") -// } -// -// try transaction.commit() -// } -// -// func testBuildGuelphSubredditCorpus() async throws { -// /// Generates the LMDB durable storage to disk but runs no tests otherwise -// -// /// Setting up the environment -// let env = try Environment() -// try env.setMapSize(ONE_GB) -// try env.setMaxReaders(DEFAULT_MAXREADERS) -// try env.setMaxDBs(DEFAULT_MAXDBS) -// try env.open(path: workingDirectoryPath) -// -// /// Get subreddit data -// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { -// fatalError("Failed to find waterloo_submissions.zst in test bundle.") -// } -// guard let submissionsData = try? Data(contentsOf: submissionsURL) else { -// fatalError("Failed to load waterloo_submissions.zst from test bundle.") -// } -// -// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) -// -// let transaction = try Transaction.begin(.write, in: env) -// -// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) -// -// let corpus = try DurableHNSWCorpus( -// encoder: documentEncoder, -// namespace: "subreddit_durable", -// in: transaction -// ) -// -// /// Add documents to corpus -// for submission in submissions { -// if let text = submission.selftext { -// try corpus.addUntokenizedDocument(text, in: transaction) -// } -// } -// -// /// Save dictionary to disk -// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") -// -// try transaction.commit() -// } -// -// func testQueryGuelphSubredditCorpus() async throws { -// let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) -// -// /// Setting up the environment -// let env = try Environment() -// try env.setMapSize(ONE_GB) -// try env.setMaxReaders(DEFAULT_MAXREADERS) -// try env.setMaxDBs(DEFAULT_MAXDBS) -// try env.open(path: workingDirectoryPath) -// -// /// Reading the memory map (and dictionary) from disk -// let transaction = try Transaction.begin(.read, in: env) -// -// let corpus = try DurableHNSWCorpus( -// encoder: documentEncoder, -// namespace: "subreddit_durable", -// in: transaction -// ) -// -// corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") -// -// let query = "I love waterloo and I love the geese." -// let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } -// -// let result = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction) -// -// for result in result { -// let key = Int(result.id.foreignKey)! -// print(corpus.getUntokenizedDocument(at: key)) -// } -// } -//} -//#endif -// +// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) +// print("Resources directory: \(resourcesDirectoryURL)") +// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) + } + + func testBuildBasicCorpus() throws { + let docs = [ + "CNTK formerly known as Computational Network Toolkit", + "is a free easy-to-use open-source commercial-grade toolkit", + "that enable us to train deep learning algorithms to learn like the human brain." + ] + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(ONE_GB) + try env.setMaxReaders(DEFAULT_MAXREADERS) + try env.setMaxDBs(DEFAULT_MAXDBS) + try env.open(path: workingDirectoryPath) + + /// Writing to LMDB + let transaction = try Transaction.begin(.write, in: env) + + let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( + namespace: "testBasicExample", + in: transaction + ) + + for doc in docs { + try corpus.addUntokenizedDocument(doc, in: transaction) + } + + try transaction.commit() + + /// Reading from LMDB + let readTransaction = try Transaction.begin(.read, in: env) + + let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( + namespace: "testBasicExample", + in: readTransaction + ) + + readTransaction.abort() + + // XCTAssert(readCorpus.count == 3) + /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads + /// This is because size is only incremented when insertion is called but it is not called when read from disk! + } + + func testQueryBasicCorpus() async throws { + let docs = [ + "The quick brown fox jumps over the lazy dog", + "I enjoy taking long walks along the beach at sunset", + "Advances in neural networks have enabled new AI capabilities", + "The stock market experienced a significant downturn last week", + "Cooking a good meal can be both an art and a science", + "The exploration of space is both challenging and rewarding", + "Machine learning models are becoming increasingly sophisticated", + "I love reading about history and ancient civilizations" + ] + + let query = "I like to read about new technology and artificial intelligence" + //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(ONE_GB) + try env.setMaxReaders(DEFAULT_MAXREADERS) + try env.setMaxDBs(DEFAULT_MAXDBS) + try env.open(path: workingDirectoryPath) + + let transaction = try Transaction.begin(.write, in: env) + + /// Saving the memory map to disk + let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( + namespace: "testBasicQueryExample", + in: transaction + ) + + for doc in docs { + try corpus.addUntokenizedDocument(doc, in: transaction) + } + + corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") + + try transaction.commit() + + do { + let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) } + + /// Reading the memory map (and dictionary) from disk + let readTransaction = try Transaction.begin(.write, in: env) + + let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>( + namespace: "testBasicQueryExample", + in: readTransaction + ) + + readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer? + + let result = try readCorpus.index.find(near: queryVector, limit: 8, in: transaction) + + for result in result { + let key = Int(result.id.foreignKey)! + print(readCorpus.getUntokenizedDocument(at: key)) + } + } catch { + print("Error when trying corpus.encodedDocuments.find(): \(error)") + } + + try transaction.commit() + } + + func testBuildGuelphSubredditCorpus() async throws { + /// Generates the LMDB durable storage to disk but runs no tests otherwise + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(ONE_GB) + try env.setMaxReaders(DEFAULT_MAXREADERS) + try env.setMaxDBs(DEFAULT_MAXDBS) + try env.open(path: workingDirectoryPath) + + /// Get subreddit data + guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + guard let submissionsData = try? Data(contentsOf: submissionsURL) else { + fatalError("Failed to load waterloo_submissions.zst from test bundle.") + } + + let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) + + let transaction = try Transaction.begin(.write, in: env) + + let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + + let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( + encoder: documentEncoder, + namespace: "subreddit_durable", + in: transaction + ) + + /// Add documents to corpus + for submission in submissions { + if let text = submission.selftext { + try corpus.addUntokenizedDocument(text, in: transaction) + } + } + + /// Save dictionary to disk + corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") + + try transaction.commit() + } + + func testQueryGuelphSubredditCorpus() async throws { + let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(ONE_GB) + try env.setMaxReaders(DEFAULT_MAXREADERS) + try env.setMaxDBs(DEFAULT_MAXDBS) + try env.open(path: workingDirectoryPath) + + /// Reading the memory map (and dictionary) from disk + let transaction = try Transaction.begin(.read, in: env) + + let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>( + encoder: documentEncoder, + namespace: "subreddit_durable", + in: transaction + ) + + corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") + + let query = "I love waterloo and I love the geese." + let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) } + + let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction) + + for result in result { + let key = Int(result.id.foreignKey)! + print(corpus.getUntokenizedDocument(at: key)) + } + } +} +#endif + diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift index a198da5d9b710893700140decda1834014062379..c0ac5382bd39d8f132864b527d1fe414eaf9a076 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift @@ -165,7 +165,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) //let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>() + let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>(typicalNeighborhoodSize: 10) for submission in submissions { if let text = submission.selftext {