From 22e0de05a15b10a4c218697f48fbd8df12221c63 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Tue, 12 Mar 2024 18:14:18 -0400
Subject: [PATCH] Durable query for single lifecycle testcase

---
 .../DurableHNSWCorpus.swift                   |  1 +
 .../DeterministicDurableVectorIndex.swift     |  2 +-
 .../SwiftNLPTests/2. Encoding/HNSWTests.swift | 81 +++++++++++++------
 3 files changed, 58 insertions(+), 26 deletions(-)

diff --git a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift
index fc76b183..e7a117ee 100644
--- a/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/DurableHNSWCorpus.swift	
@@ -26,6 +26,7 @@ class DurableHNSWCorpus/*<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus*/ {
     // typicalNeighbourhoodSize = 20 is a standard benchmark
     init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "durablehnsw", in transaction: Transaction) throws {
         _documentEncoder = ContextFreeEncoder(source: encoding)
+        
         encodedDocuments = try DeterministicDurableVectorIndex(
             namespace: namespace,
             typicalNeighborhoodSize: typicalNeighborhoodSize,
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift
index ffe9313d..ab2ca1e5 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicDurableVectorIndex.swift	
@@ -64,7 +64,7 @@ public struct DeterministicDurableVectorIndex/*<Vector: Collection & Codable> wh
     public func find(near query: Vector, limit: Int, exact: Bool = false, in transaction: Transaction) throws -> [Index.Neighbor] {
         if exact {
             // TODO: Exact search logic
-            fatalError("Exact search logic for DeterministicDurableVectorIndex is not supported")
+            fatalError("Exact search logic for DeterministicDurableVectorIndex is not currently supported")
         } else {
             let accessor = try Index.Accessor(for: base, in: transaction)
             return Array(try accessor.find(near: query, limit: limit))
diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift
index 06382a1c..46d246e3 100644
--- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
+++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift	
@@ -31,16 +31,17 @@ final class HNSWTests: XCTestCase {
             "that enable us to train deep learning algorithms to learn like the human brain."
          ]
         
+        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample")
 //        var corpus = HNSWCorpus(encoding: .glove6B50d)
 //        corpus.addUntokenizedDocuments(docs)
 //        
-//        let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample")
-//        let corpusSize = dataHandler.getCorpusSize()
+        let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample")
+        let corpusSize = dataHandler.getCorpusSize()
 //        let dictionarySize = dataHandler.getDictionarySize(includeKey: false)
 //        print("Corpus size: \(corpusSize) bytes")
 //        print("Dictionary size: \(dictionarySize) bytes")
 //        dataHandler.saveMemoryMap()
-        let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample")
+        
         
         XCTAssert(corpus.count == 3)
         
@@ -56,20 +57,23 @@ final class HNSWTests: XCTestCase {
             "is a free easy-to-use open-source commercial-grade toolkit",
             "that enable us to train deep learning algorithms to learn like the human brain."
         ]
-
+        
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+        
         /// Setting up the environment
         let env = try Environment()
         try env.setMapSize(1_073_741_824) /// 1 GB
         try env.setMaxReaders(126) /// default
         try env.setMaxDBs(10)
+        try env.open(path: filepath)
         
-        let fileManager = FileManager.default
-        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
-        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
-        try env.open(path: FilePath(directoryURL.path))
-        
+        // Writing to disk
         let transaction = try Transaction.begin(.write, in: env)
-        
+
         var corpus = try DurableHNSWCorpus(
             encoding: .glove6B50d,
             namespace: "testbasicexampledurable",
@@ -79,14 +83,23 @@ final class HNSWTests: XCTestCase {
         for doc in docs {
             try corpus.addUntokenizedDocument(doc, in: transaction)
         }
-        
+
         try transaction.commit()
-//        let transaction = try Transaction.begin(.read, in: env)
-//        
-//        transaction.abort()
         
+        // Reading from disk
         
-        XCTAssert(corpus.count == 3)
+        let readTransaction = try Transaction.begin(.read, in: env)
+        
+        let readCorpus = try DurableHNSWCorpus(
+            encoding: .glove6B50d,
+            namespace: "testbasicexampledurable",
+            in: readTransaction
+        )
+        
+        readTransaction.abort()
+        
+//        XCTAssert(corpus.count == 3)
+        XCTAssert(readCorpus.count == 3)
     }
     
     
@@ -254,30 +267,49 @@ final class HNSWTests: XCTestCase {
         let query = "I like to read about new technology and artificial intelligence"
         let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
         
+        /// Setting up working directory
+        let fileManager = FileManager.default
+        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
+        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
+        let filepath = FilePath(directoryURL.path)
+        
         /// Setting up the environment
         let env = try Environment()
         try env.setMapSize(1_073_741_824) /// 1 GB
-        try env.setMaxReaders(256)
+        try env.setMaxReaders(126) /// default
         try env.setMaxDBs(10)
-        
-        let fileManager = FileManager.default
-        let directoryURL = fileManager.temporaryDirectory.appendingPathComponent("lmdb")
-        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
-        try env.open(path: FilePath(directoryURL.path))
+        try env.open(path: filepath)
         
         let transaction = try Transaction.begin(.write, in: env)
         
-        var corpus = try DurableHNSWCorpus(encoder: _documentEncoder, namespace: "testbasicqueryexampledurable", in: transaction)
+        var corpus = try DurableHNSWCorpus(
+            encoder: _documentEncoder,
+            namespace: "testbasicqueryexampledurable",
+            in: transaction
+        )
         
         for doc in docs {
             try corpus.addUntokenizedDocument(doc, in: transaction)
         }
         
+        try transaction.commit()
+        
         do {
             print("Attempting to query corpus.encodedDocuments.find()...")
             
             let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
-            let result = try corpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction)
+            
+            let readTransaction = try Transaction.begin(.write, in: env)
+            
+            let readCorpus = try DurableHNSWCorpus(
+                encoder: _documentEncoder,
+                namespace: "testbasicqueryexampledurable",
+                in: readTransaction
+            )
+            
+            // Do not add documents here: this corpus should only read what was committed above.
+            
+            let result = try readCorpus.encodedDocuments.find(near: queryVector, limit: 8, in: transaction)
             
             for result in result {
                 let key = Int(result.id.foreignKey)!
@@ -288,9 +320,8 @@ final class HNSWTests: XCTestCase {
         } catch {
             print("Error when trying corpus.encodedDocuments.find(): \(error)")
         }
-        transaction.reset()
+        
         try transaction.commit()
-        try transaction.renew()
     }
     
     func testLargeQueryExample() async throws {
-- 
GitLab