From 22e0de05a15b10a4c218697f48fbd8df12221c63 Mon Sep 17 00:00:00 2001 From: Mingchung Xia <mingchung.xia@gmail.com> Date: Tue, 12 Mar 2024 18:14:18 -0400 Subject: [PATCH] Durable query for single lifecycle testcase --- .../DurableHNSWCorpus.swift | 1 + .../DeterministicDurableVectorIndex.swift | 2 +- .../SwiftNLPTests/2. Encoding/HNSWTests.swift | 81 +++++++++++++------ 3 files changed, 58 insertions(+), 26 deletions(-) diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift index fc76b183..e7a117ee 100644 --- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift @@ -26,6 +26,7 @@ class DurableHNSWCorpus/*<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus*/ { // typicalNeighbourhoodSize = 20 is a standard benchmark init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "durablehnsw", in transaction: Transaction) throws { _documentEncoder = ContextFreeEncoder(source: encoding) + encodedDocuments = try DeterministicDurableVectorIndex( namespace: namespace, typicalNeighborhoodSize: typicalNeighborhoodSize, diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift index ffe9313d..ab2ca1e5 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift @@ -64,7 +64,7 @@ public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> wh public func find(near query: Vector, limit: Int, exact: Bool = false, in transaction: Transaction) throws -> [Index.Neighbor] { if exact { // TODO: Exact search logic - fatalError("Exact search logic for DeterministicDurableVectorIndex is not supported") + fatalError("Exact search logic for DeterministicDurableVectorIndex is not currently supported") } else { let accessor = try Index.Accessor(for: base, in: transaction) return Array(try accessor.find(near: query, limit: limit)) diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index 06382a1c..46d246e3 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift @@ -31,16 +31,17 @@ final class HNSWTests: XCTestCase { "that enable us to train deep learning algorithms to learn like the human brain." ] + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") // var corpus = HNSWCorpus(encoding: .glove6B50d) // corpus.addUntokenizedDocuments(docs) // -// let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") -// let corpusSize = dataHandler.getCorpusSize() + let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") + let corpusSize = dataHandler.getCorpusSize() // let dictionarySize = dataHandler.getDictionarySize(includeKey: false) // print("Corpus size: \(corpusSize) bytes") // print("Dictionary size: \(dictionarySize) bytes") // dataHandler.saveMemoryMap() - let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") + XCTAssert(corpus.count == 3) @@ -56,20 +57,23 @@ final class HNSWTests: XCTestCase { "is a free easy-to-use open-source commercial-grade toolkit", "that enable us to train deep learning algorithms to learn like the human brain." ] - + + /// Setting up working directory + let fileManager = FileManager.default + let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") + try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) + let filepath = FilePath(directoryURL.path) + /// Setting up the environment let env = try Environment() try env.setMapSize(1_073_741_824) /// 1 GB try env.setMaxReaders(126) /// default try env.setMaxDBs(10) + try env.open(path: filepath) - let fileManager = FileManager.default - let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") - try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) - try env.open(path: FilePath(directoryURL.path)) - + // Writing to disk let transaction = try Transaction.begin(.write, in: env) - + var corpus = try DurableHNSWCorpus( encoding: .glove6B50d, namespace: "testbasicexampledurable", @@ -79,14 +83,23 @@ final class HNSWTests: XCTestCase { for doc in docs { try corpus.addUntokenizedDocument(doc, in: transaction) } - + try transaction.commit() -// let transaction = try Transaction.begin(.read, in: env) -// -// transaction.abort() + // Reading from disk - XCTAssert(corpus.count == 3) + let readTransaction = try Transaction.begin(.read, in: env) + + let readCorpus = try DurableHNSWCorpus( + encoding: .glove6B50d, + namespace: "testbasicexampledurable", + in: readTransaction + ) + + readTransaction.abort() + +// XCTAssert(corpus.count == 3) + XCTAssert(readCorpus.count == 3) } @@ -254,30 +267,49 @@ final class HNSWTests: XCTestCase { let query = "I like to read about new technology and artificial intelligence" let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + /// Setting up working directory + let fileManager = FileManager.default + let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") + try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) + let filepath = FilePath(directoryURL.path) + /// Setting up the environment let env = try Environment() try env.setMapSize(1_073_741_824) /// 1 GB - try env.setMaxReaders(256) + try env.setMaxReaders(126) /// default try env.setMaxDBs(10) - - let fileManager = FileManager.default - let directoryURL = fileManager.temporaryDirectory.appendingPathComponent("lmdb") - try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) - try env.open(path: FilePath(directoryURL.path)) + try env.open(path: filepath) let transaction = try Transaction.begin(.write, in: env) - var corpus = try DurableHNSWCorpus(encoder: _documentEncoder, namespace: "testbasicqueryexampledurable", in: transaction) + var corpus = try DurableHNSWCorpus( + encoder: _documentEncoder, + namespace: "testbasicqueryexampledurable", + in: transaction + ) for doc in docs { try corpus.addUntokenizedDocument(doc, in: transaction) } + try transaction.commit() + do { print("Attempting to query corpus.encodedDocuments.find()...") let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) } - let result = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction) + + let readTransaction = try Transaction.begin(.write, in: env) + + let readCorpus = try DurableHNSWCorpus( + encoder: _documentEncoder, + namespace: "testbasicqueryexampledurable", + in: readTransaction + ) + + // do not add documents here! + + let result = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction) for result in result { let key = Int(result.id.foreignKey)! @@ -288,9 +320,8 @@ final class HNSWTests: XCTestCase { } catch { print("Error when trying corpus.encodedDocuments.find(): \(error)") } - transaction.reset() + try transaction.commit() - try transaction.renew() } func testLargeQueryExample() async throws { -- GitLab