diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift index b9c22c92081a1f689ebbc6029498ad049cb7dea1..cc0711b2c6bf770251d86ce78c5541dace5cf114 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Dictionary.swift @@ -1,8 +1,70 @@ // -// File.swift -// +// HNSWCorpus + Dictionary.swift // -// Created by Mingchung Xia on 2024-02-26. +// +// Created by Mingchung Xia on 2024-02-14. // import Foundation + +extension DurableHNSWCorpus { + /// This extension is used for the dictionary operations + public struct DocumentVectorPair { + var untokenizedDocument: String + var vector: [Scalar] + + init(untokenizedDocument: String, vector: [Scalar]) { + self.untokenizedDocument = untokenizedDocument + self.vector = vector + } + } + + @inlinable + func getUntokenizedDocument(at key: Int) -> String { + if let pair = dictionary[key] { + return pair.untokenizedDocument + } else { + fatalError("Key \(key) not found in HNSW dictionary") + } + } + + @inlinable + func getVector(at key: Int) -> [Scalar] { + if let pair = dictionary[key] { + return pair.vector + } else { + fatalError("Key \(key) not found in HNSW dictionary") + } + } + + @inlinable + func getDictionary() -> [Int: DocumentVectorPair] { + return dictionary + } + + func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) { + dictionary[key] = DocumentVectorPair( + untokenizedDocument: document, + vector: vector + ) + } +} +// +//extension DurableHNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable { +// enum CodingKeys: String, CodingKey { +// case untokenizedDocument +// case vector +// } +// +// internal init(from decoder: Decoder) throws { +// let container = try decoder.container(keyedBy: CodingKeys.self) +// untokenizedDocument = try container.decode(String.self, forKey: .untokenizedDocument) +// vector = try container.decode([Scalar].self, forKey: .vector) +// } +// +// internal func encode(to encoder: Encoder) throws { +// var container = encoder.container(keyedBy: CodingKeys.self) +// try container.encode(untokenizedDocument, forKey: .untokenizedDocument) +// try container.encode(vector, forKey: .vector) +// } +//} diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift index b9c22c92081a1f689ebbc6029498ad049cb7dea1..efe0a1cc3274c24a93589ab464e04be1d6391861 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus + Sequence.swift @@ -1,8 +1,63 @@ +//// Copyright (c) 2024 Jim Wallace +//// +//// Permission is hereby granted, free of charge, to any person +//// obtaining a copy of this software and associated documentation +//// files (the "Software"), to deal in the Software without +//// restriction, including without limitation the rights to use, +//// copy, modify, merge, publish, distribute, sublicense, and/or sell +//// copies of the Software, and to permit persons to whom the +//// Software is furnished to do so, subject to the following +//// conditions: +//// +//// The above copyright notice and this permission notice shall be +//// included in all copies or substantial portions of the Software. +//// +//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +//// OTHER DEALINGS IN THE SOFTWARE. // -// File.swift -// +///// It may be more useful to make the conformances based on the dictionary instead of encodedDocuments +// +//extension DurableHNSWCorpus: Sequence, Collection { +// +// typealias Element = [Scalar] +// +// // Sequence Protocol Requirements +// @inlinable +// func makeIterator() -> AnyIterator<Element> { +// var index = 0 +// return AnyIterator { +// defer { index += 1 } +// guard index < self.encodedDocuments.base.vectors.count else { return nil } +// let element = self.encodedDocuments.base.vectors[index] // consider using .find +// return element +// } +// } +// +// // Collection Protocol Requirements +// @inlinable +// var startIndex: Int { +// return encodedDocuments.base.vectors.startIndex +// } +// +// @inlinable +// var endIndex: Int { +// return encodedDocuments.base.vectors.endIndex +// } +// +// @inlinable +// subscript(position: Int) -> Element { +// return encodedDocuments.base.vectors[position] +// } +// +// @inlinable +// func index(after i: Int) -> Int { +// return encodedDocuments.base.vectors.index(after: i) +// } +//} // -// Created by Mingchung Xia on 2024-02-26. -// - -import Foundation diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift index b9c22c92081a1f689ebbc6029498ad049cb7dea1..fc76b1831d18ac5989f97470794a6c64ac5a6c12 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift @@ -1,8 +1,57 @@ // -// File.swift -// +// DurableHNSWCorpus.swift +// // // Created by Mingchung Xia on 2024-02-26. // import Foundation +import CoreLMDB +import CoreLMDBCoders + +class DurableHNSWCorpus/*<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus*/ { + public typealias Scalar = Double /// This is a placeholder to make things work easier right now + + public typealias HNSWDictionary = [Int: DocumentVectorPair] + + internal var _documentEncoder: any SNLPEncoder + var zeroes: [Scalar] { _documentEncoder.zeroes as! [Scalar] } + + var encodedDocuments: DeterministicDurableVectorIndex + var count: Int { encodedDocuments.size } + + // Keeps track of the original document for client code + var dictionary: HNSWDictionary = [:] + + // typicalNeighbourhoodSize = 20 is a standard benchmark + init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "durablehnsw", in transaction: Transaction) throws { + _documentEncoder = ContextFreeEncoder(source: encoding) + encodedDocuments = try DeterministicDurableVectorIndex( + namespace: namespace, + typicalNeighborhoodSize: typicalNeighborhoodSize, + in: transaction + ) + } + + init(encoder: any SNLPEncoder, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "durablehnsw", in transaction: Transaction) throws { + _documentEncoder = encoder + encodedDocuments = try DeterministicDurableVectorIndex( + namespace: namespace, + typicalNeighborhoodSize: typicalNeighborhoodSize, + in: transaction + ) + } + + @inlinable + func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws { + /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder + /// encodedDocuments.insert will insert and return the corresponding key (id)s + let encodedVector = _documentEncoder.encodeSentence(document) as! [Scalar] + let key = try encodedDocuments.insert(encodedVector, in: transaction) + addDocumentVectorPair( + at: key, + document: document, + vector: encodedVector + ) + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift index b9c22c92081a1f689ebbc6029498ad049cb7dea1..ffe9313d1f1a8926e79b8e2fd68fdb9e5cb9e548 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift @@ -1,8 +1,82 @@ +// Copyright (c) 2024 Jim Wallace // -// File.swift -// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: // -// Created by Mingchung Xia on 2024-02-26. +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. +// +// Created by Mingchung Xia on 2024-02-26. // import Foundation +import PriorityHeapModule +import PriorityHeapAlgorithms +import SimilarityMetric +import HNSWAlgorithm +import HNSWDurable +import CoreLMDB +import CoreLMDBCoders + +// MARK: This uses the persistent DurableVectorIndex + +extension DurableVectorIndex { + public typealias Neighbor = NearbyVector<DurableVectorIndex.Accessor.CompoundKey, Metric.Vector, Metric.Similarity> +} + +public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint*/ { + public typealias Vector = [Double] + public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element> + public var base: Index + public var typicalNeighborhoodSize: Int + public var size: Int = 0 + + private var srng = SeedableRandomNumberGenerator(seed: 1) + // private var drng = DeterministicRandomNumberGenerator(seed: 1) + + public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws { + let metric = CartesianDistanceMetric<Vector>() + let config = Config.unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize) + self.base = try Index( + namespace: namespace, + metric: metric, + config: config, + in: transaction + ) + self.typicalNeighborhoodSize = typicalNeighborhoodSize + } + + public func find(near query: Vector, limit: Int, exact: Bool = false, in transaction: Transaction) throws -> [Index.Neighbor] { + if exact { + // TODO: Exact search logic + fatalError("Exact search logic for DeterministicDurableVectorIndex is not supported") + } else { + let accessor = try Index.Accessor(for: base, in: transaction) + return Array(try accessor.find(near: query, limit: limit)) + } + } + + @discardableResult + public mutating func insert(_ vector: Vector, in transaction: Transaction) throws -> Int { + defer { size += 1 } + let accessor = try Index.Accessor(for: base, in: transaction) + let key = String(size) + accessor.insert(vector, forKey: key, using: &srng) + return self.size + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift index 22d6cdc951d370f1d7328f8b72ac91cd0816ed6d..c75afd120ea3e535e6d71943d020f67ba49fbbdd 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift @@ -24,11 +24,7 @@ import Foundation class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { - public enum MemoryDuration { - case ephemeral - case durable - } - + public typealias HNSWDictionary = [Int: DocumentVectorPair] internal var _documentEncoder: any SNLPEncoder diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index d3696cbc06f36ae45841b666a52e9c0f3f916864..06382a1cc05b41f3cc6b8f8832a186356e3bcf40 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift @@ -1,6 +1,9 @@ #if os(macOS) import XCTest import Foundation +import CoreLMDB +import CoreLMDBCoders +import System @testable import SwiftNLP final class HNSWTests: XCTestCase { @@ -47,6 +50,46 @@ final class HNSWTests: XCTestCase { } } + func testBasicExampleDurable() throws { + let docs = [ + "CNTK formerly known as Computational Network Toolkit", + "is a free easy-to-use open-source commercial-grade toolkit", + "that enable us to train deep learning algorithms to learn like the human brain." + ] + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(1_073_741_824) /// 1 GB + try env.setMaxReaders(126) /// default + try env.setMaxDBs(10) + + let fileManager = FileManager.default + let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") + try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) + try env.open(path: FilePath(directoryURL.path)) + + let transaction = try Transaction.begin(.write, in: env) + + var corpus = try DurableHNSWCorpus( + encoding: .glove6B50d, + namespace: "testbasicexampledurable", + in: transaction + ) + + for doc in docs { + try corpus.addUntokenizedDocument(doc, in: transaction) + } + + try transaction.commit() +// let transaction = try Transaction.begin(.read, in: env) +// +// transaction.abort() + + + XCTAssert(corpus.count == 3) + } + + // Load a bigger set of documents and confirm that func testBiggerExample() throws { @@ -196,6 +239,60 @@ final class HNSWTests: XCTestCase { } } + func testBasicQueryExampleDurable() async throws { + let docs = [ + "The quick brown fox jumps over the lazy dog", + "I enjoy taking long walks along the beach at sunset", + "Advances in neural networks have enabled new AI capabilities", + "The stock market experienced a significant downturn last week", + "Cooking a good meal can be both an art and a science", + "The exploration of space is both challenging and rewarding", + "Machine learning models are becoming increasingly sophisticated", + "I love reading about history and ancient civilizations" + ] + + let query = "I like to read about new technology and artificial intelligence" + let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + + /// Setting up the environment + let env = try Environment() + try env.setMapSize(1_073_741_824) /// 1 GB + try env.setMaxReaders(256) + try env.setMaxDBs(10) + + let fileManager = FileManager.default + let directoryURL = fileManager.temporaryDirectory.appendingPathComponent("lmdb") + try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) + try env.open(path: FilePath(directoryURL.path)) + + let transaction = try Transaction.begin(.write, in: env) + + var corpus = try DurableHNSWCorpus(encoder: _documentEncoder, namespace: "testbasicqueryexampledurable", in: transaction) + + for doc in docs { + try corpus.addUntokenizedDocument(doc, in: transaction) + } + + do { + print("Attempting to query corpus.encodedDocuments.find()...") + + let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) } + let result = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction) + + for result in result { + let key = Int(result.id.foreignKey)! + print(corpus.getUntokenizedDocument(at: key)) + } + + print("Query completed!") + } catch { + print("Error when trying corpus.encodedDocuments.find(): \(error)") + } + transaction.reset() + try transaction.commit() + try transaction.renew() + } + func testLargeQueryExample() async throws { let docs = [