Commit 5389ad93 authored by Jim Wallace
Restored DurableHNSWCorpus + adjusted to new SNLPCorpus protocol

parent 4bd282f5
Merge request !14: Improved ergonomics for generic types: SNLPCorpus, SNLPEncoder, InMemoryCorpus
Pipeline #115905 passed with warnings
Showing 395 additions and 357 deletions
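The generics in this diff are written against the reworked SNLPCorpus / SNLPEncoder protocols named in the merge request. Their definitions are not part of this commit, but the constraints used below (Item.fullText, Encoder.Scalar, a zero-argument Encoder(), zeroes, dimensions, encodeToken/encodeSentence) imply shapes roughly like the following sketch; treat every detail here as inferred from usage, not as the MR's actual declarations:

// Sketch only: protocol shapes inferred from how this diff uses them.
protocol SNLPDataItem {
    var fullText: String { get }
}

protocol SNLPEncoder {
    associatedtype Scalar: BinaryFloatingPoint
    var zeroes: [Scalar] { get }
    var dimensions: UInt { get }
    init()
    func encodeToken(_ token: String) -> [Scalar]
    func encodeSentence(_ sentence: String) -> [Scalar]
}

protocol SNLPCorpus {
    associatedtype Item: SNLPDataItem
    associatedtype Encoder: SNLPEncoder
    func searchFor(_ query: String) -> [Item]
}

The tests below use String as the Item type, so presumably the MR also gives String a trivial SNLPDataItem conformance (fullText returning self). The document-ingestion requirements are omitted from this sketch because the diff never shows them.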
@@ -32,9 +32,9 @@ extension DurableHNSWCorpus {
     /// This extension is used for the dictionary operations
     public struct DocumentVectorPair {
         var untokenizedDocument: String
-        var vector: [Scalar]
+        var vector: [Encoder.Scalar]

-        init(untokenizedDocument: String, vector: [Scalar]) {
+        init(untokenizedDocument: String, vector: [Encoder.Scalar]) {
             self.untokenizedDocument = untokenizedDocument
             self.vector = vector
         }
@@ -50,7 +50,7 @@ extension DurableHNSWCorpus {
     }

     @inlinable
-    func getVector(at key: Int) -> [Scalar] {
+    func getVector(at key: Int) -> [Encoder.Scalar] {
         if let pair = dictionary[key] {
             return pair.vector
         } else {
@@ -63,7 +63,7 @@ extension DurableHNSWCorpus {
         return dictionary
     }

-    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
+    func addDocumentVectorPair(at key: Int, document: String, vector: [Encoder.Scalar]) {
         dictionary[key] = DocumentVectorPair(
             untokenizedDocument: document,
             vector: vector
......
@@ -131,12 +131,12 @@ extension DurableHNSWCorpus {
         let vectorLength = vectorLengthData.withUnsafeBytes { $0.load(as: Int.self) }
         index += MemoryLayout<Int>.size

-        var vector = [Scalar]()
+        var vector = [Encoder.Scalar]()
         for _ in 0..<vectorLength {
-            let scalarData = data.subdata(in: index..<index+MemoryLayout<Scalar>.size)
-            let scalar = scalarData.withUnsafeBytes { $0.load(as: Scalar.self) }
+            let scalarData = data.subdata(in: index..<index+MemoryLayout<Encoder.Scalar>.size)
+            let scalar = scalarData.withUnsafeBytes { $0.load(as: Encoder.Scalar.self) }
             vector.append(scalar)
-            index += MemoryLayout<Scalar>.size
+            index += MemoryLayout<Encoder.Scalar>.size
         }

         // Add the key-value pair to the dictionary
......
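The read loop above implies a simple binary layout for each stored vector: one Int holding the element count, followed by that many raw scalars. The matching write side is not shown in this diff; a minimal sketch of what it presumably looks like, using only standard-library calls:

// Sketch: serialize a vector in the layout the loop above reads back
// ([Int count][scalar 0][scalar 1]...). Not the package's actual writer.
import Foundation

func appendVector<Scalar: BinaryFloatingPoint>(_ vector: [Scalar], to data: inout Data) {
    var count = vector.count
    withUnsafeBytes(of: &count) { data.append(contentsOf: $0) }      // Int header
    for var scalar in vector {
        withUnsafeBytes(of: &scalar) { data.append(contentsOf: $0) } // raw scalar bytes
    }
}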
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-16.
//
#if os(macOS)
import Foundation
/// HNSWCorpus iterates through its dictionary of key to document vector pairs
extension DurableHNSWCorpus: Sequence, Collection {
// Sequence Protocol Requirements
@inlinable
func makeIterator() -> AnyIterator<DocumentVectorPair> {
var iterator = dictionary.values.makeIterator()
return AnyIterator {
return iterator.next()
}
}
// Collection Protocol Requirements
@inlinable
var startIndex: Int {
return dictionary.keys.sorted().startIndex
}
@inlinable
var endIndex: Int {
return dictionary.keys.sorted().endIndex
}
@inlinable
subscript(position: Int) -> DocumentVectorPair {
let key = dictionary.keys.sorted()[position]
guard let pair = dictionary[key] else {
fatalError("Key \(key) not found in HNSW dictionary")
}
return pair
}
@inlinable
func index(after i: Int) -> Int {
return dictionary.keys.sorted().index(after: i)
}
}
#endif
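With the Sequence and Collection conformances restored, client code can traverse a corpus directly; hypothetical usage:

// Hypothetical client code: iterate every stored document/vector pair.
for pair in corpus {
    print(pair.untokenizedDocument, pair.vector.count)
}

One design note: the Sequence path (makeIterator) visits dictionary.values in dictionary order, while the Collection path (startIndex, endIndex, subscript, index(after:)) sorts the keys on every call, so the two traversal orders can differ and each Collection operation pays an O(n log n) sort; caching the sorted key array would make a full indexed traversal linear.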
@@ -34,50 +34,87 @@ import CoreLMDBCoders

-// MARK: DurableHNSWCorpus cannot conform to SNLPCorpus under its current definition
-// This is because addingUntokenizedDocuments in a DurableHNSWCorpus requires an additional parameter (transaction) and can throw
-final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemoryLayoutStorableFloat> {
+final class DurableHNSWCorpus<Item: SNLPDataItem, Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable & UnsafeMemoryLayoutStorableFloat {

     public typealias HNSWDictionary = [Int: DocumentVectorPair]

-    internal var documentEncoder: any SNLPEncoder<Scalar>
-    var zeroes: [Scalar] { documentEncoder.zeroes }
-    var encodedDocuments: DeterministicDurableVectorIndex<Scalar>
-    var count: Int { encodedDocuments.size }
+    internal var documentEncoder: Encoder
+    internal var documents = ContiguousArray<Item>()
+    internal var encodedDocuments = ContiguousArray<[Encoder.Scalar]>()
+    var index: DeterministicDurableVectorIndex<Encoder.Scalar>

     private let ONE_GB: Int = 1_073_741_824
     private let ONE_MB: Int = 1_048_576
     private let ONE_KB: Int = 1_024
     private let ONE_B: Int = 1
     private let DEFAULT_MAXREADERS: UInt32 = 126
     private let DEFAULT_MAXDBS: UInt32 = 10

     // Keeps track of the original document for client code
     var dictionary: HNSWDictionary = [:]

     // typicalNeighbourhoodSize = 20 is a standard benchmark
-    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
-        documentEncoder = ContextFreeEncoder(source: encoding)
-
-        encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
-            namespace: namespace,
-            typicalNeighborhoodSize: typicalNeighborhoodSize,
-            in: transaction
-        )
-    }
+//    init(encoding: ContextFreeEncoder<Encoder.Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
+//        documentEncoder = ContextFreeEncoder<Encoder.Scalar>(source: encoding) as! Encoder
+//
+//        index = try DeterministicDurableVectorIndex<Encoder.Scalar>(
+//            namespace: namespace,
+//            typicalNeighborhoodSize: typicalNeighborhoodSize,
+//            in: transaction
+//        )
+//    }

-    init(encoder: any SNLPEncoder<Scalar>, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
+    init(encoder: Encoder = Encoder(), typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
         documentEncoder = encoder

-        encodedDocuments = try DeterministicDurableVectorIndex<Scalar>(
+        index = try DeterministicDurableVectorIndex<Encoder.Scalar>(
             namespace: namespace,
             typicalNeighborhoodSize: typicalNeighborhoodSize,
             in: transaction
         )
     }

     @inlinable
-    func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws {
-        /// forced unwrap as! [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
-        /// encodedDocuments.insert will insert and return the corresponding key (id)s
-        let encodedVector = documentEncoder.encodeSentence(document)
-        let key = try encodedDocuments.insert(encodedVector, in: transaction)
+    func addUntokenizedDocument(_ document: Item, in transaction: Transaction) throws {
+        /// encodedDocuments.insert will insert and return the corresponding key (id)s
+        documents.append(document)
+        encodedDocuments.append(documentEncoder.encodeSentence(document.fullText))
+        assert( documents.count == encodedDocuments.count )
+
+        let encodedVector = documentEncoder.encodeSentence(document.fullText)
+        let key = try index.insert(encodedVector, in: transaction)

         addDocumentVectorPair(
             at: key,
-            document: document,
+            document: document.fullText,
             vector: encodedVector
         )
     }
+
+    func searchFor(_ query: String) -> [Item] {
+        return []
+    }
+
+//    func searchFor(_ query: String, in transaction: Transaction) -> [Item] {
+//        let queryVector = documentEncoder.encodeToken(query)
+//        let results = try! index.find(near: queryVector, limit: 8, in: transaction)
+//
+//        return results.map{ documents[$0.id] }
+//        return []
+//    }
 }

 #endif
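Now that DurableHNSWCorpus conforms to SNLPCorpus, callers can be written against the protocol instead of a concrete corpus type. A sketch of the ergonomic payoff (hypothetical helper, not part of the MR):

// Hypothetical generic helper enabled by the SNLPCorpus conformance.
func firstMatch<C: SNLPCorpus>(in corpus: C, for query: String) -> C.Item? {
    corpus.searchFor(query).first
}

Two details worth flagging in addUntokenizedDocument above: the document is encoded twice (once for the encodedDocuments array, once for the index insert), so hoisting a single encodeSentence call would halve the encoding work; and searchFor(_:) currently returns [] because the real lookup needs a Transaction, as the commented-out variant shows.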
@@ -34,7 +34,7 @@ import Foundation

 // MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
 typealias HNSWCorpus = EphemeralHNSWCorpus

-final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable{
+final class EphemeralHNSWCorpus<Item: SNLPDataItem,Encoder: SNLPEncoder>: SNLPCorpus where Encoder.Scalar: Codable {
public typealias HNSWDictionary = [Int: DocumentVectorPair]
......
@@ -26,18 +26,19 @@
 import CoreML

-//class CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
-//
-//    var zeroes: [Scalar]
-//
-//    func encodeToken(_ token: String) -> [Scalar] {
-//        fatalError("CoreMLEncoder not implemented yet. Get on it.")
-//    }
-//
-//    func encodeSentence(_ sentence: String) -> [Scalar] {
-//        fatalError("CoreMLEncoder not implemented yet. Get on it.")
-//    }
-//}
+struct CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
+
+    var zeroes: [Scalar] = []
+    var dimensions: UInt = 0
+
+    func encodeToken(_ token: String) -> [Scalar] {
+        fatalError("CoreMLEncoder not implemented yet. Get on it.")
+    }
+
+    func encodeSentence(_ sentence: String) -> [Scalar] {
+        fatalError("CoreMLEncoder not implemented yet. Get on it.")
+    }
+}
//@available(macOS 13.0, *)
//public class MiniLMEmbeddings {
......
@@ -25,7 +25,7 @@
 import Foundation
 import NaturalLanguage

-struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
+struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
var dimensions: UInt = 512
var zeroes: [Scalar] { Array(repeating: Scalar(0), count: Int(dimensions)) }
......
@@ -23,18 +23,22 @@
 import Foundation

-//class OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
-//
-//    var zeroes: [Scalar]
-//
-//    func fetchEncodingForToken(_ token: String) async throws -> [Scalar] {
-//        fatalError("OpenAIEncoder not implemented. Get on it.")
-//    }
-//
-//    func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] {
-//        fatalError("OpenAIEncoder not implemented. Get on it.")
-//    }
-//}
+struct OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
+
+    var zeroes: [Scalar]
+    var dimensions: UInt
+
+    init() {
+        fatalError()
+    }
+
+    func fetchEncodingForToken(_ token: String) async throws -> [Scalar] {
+        fatalError("OpenAIEncoder not implemented. Get on it.")
+    }
+
+    func fetchEncodingForSentence(_ sentence: String) async throws -> [Scalar] {
+        fatalError("OpenAIEncoder not implemented. Get on it.")
+    }
+}
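Unlike the synchronous SNLPEncoder, SNLPAsyncEncoder exposes async throwing fetchers, so call sites will await them. Hypothetical usage once the stub is implemented (the current init() deliberately traps):

// Hypothetical call site once OpenAIEncoder is implemented;
// illustrative only, since init() currently calls fatalError().
func embed(_ sentence: String) async throws -> [Double] {
    let encoder = OpenAIEncoder<Double>()
    return try await encoder.fetchEncodingForSentence(sentence)
}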
#if os(macOS)
import XCTest
import Foundation
import CoreLMDB
import System
@testable import SwiftNLP
// MARK: These tests are not to be included within the pipeline
final class DurableHNSWCorpusTests: XCTestCase {
/// This is used to skip these tests in the GitLab pipeline
override class var defaultTestSuite: XCTestSuite {
if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
return XCTestSuite(name: "Empty")
}
return super.defaultTestSuite
}
/// Setting up constants for environment
private let ONE_GB: Int = 1_073_741_824
private let ONE_MB: Int = 1_048_576
private let ONE_KB: Int = 1_024
private let ONE_B: Int = 1
private let DEFAULT_MAXREADERS: UInt32 = 126
private let DEFAULT_MAXDBS: UInt32 = 10
/// Setting up working directory
private var workingDirectoryPath: FilePath!
override func setUpWithError() throws {
try super.setUpWithError()
let fileManager = FileManager.default
let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
workingDirectoryPath = FilePath(directoryURL.path)
/// This commented out code alternatively works in the XCode bundle resource environment
// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
// let fileManager = FileManager.default
// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
// print("Resources directory: \(resourcesDirectoryURL)")
// workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
}
func testBuildBasicCorpus() throws {
let docs = [
"CNTK formerly known as Computational Network Toolkit",
"is a free easy-to-use open-source commercial-grade toolkit",
"that enable us to train deep learning algorithms to learn like the human brain."
]
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Writing to LMDB
let transaction = try Transaction.begin(.write, in: env)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicExample",
in: transaction
)
for doc in docs {
try corpus.addUntokenizedDocument(doc, in: transaction)
}
try transaction.commit()
/// Reading from LMDB
let readTransaction = try Transaction.begin(.read, in: env)
let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicExample",
in: readTransaction
)
readTransaction.abort()
// XCTAssert(readCorpus.count == 3)
/// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
/// This is because size is only incremented when insertion is called but it is not called when read from disk!
}
func testQueryBasicCorpus() async throws {
let docs = [
"The quick brown fox jumps over the lazy dog",
"I enjoy taking long walks along the beach at sunset",
"Advances in neural networks have enabled new AI capabilities",
"The stock market experienced a significant downturn last week",
"Cooking a good meal can be both an art and a science",
"The exploration of space is both challenging and rewarding",
"Machine learning models are becoming increasingly sophisticated",
"I love reading about history and ancient civilizations"
]
let query = "I like to read about new technology and artificial intelligence"
//let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
let transaction = try Transaction.begin(.write, in: env)
/// Saving the memory map to disk
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicQueryExample",
in: transaction
)
for doc in docs {
try corpus.addUntokenizedDocument(doc, in: transaction)
}
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
try transaction.commit()
do {
let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }
/// Reading the memory map (and dictionary) from disk
let readTransaction = try Transaction.begin(.write, in: env)
let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
namespace: "testBasicQueryExample",
in: readTransaction
)
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?
let results = try readCorpus.index.find(near: queryVector, limit: 8, in: readTransaction)
for result in results {
let key = Int(result.id.foreignKey)!
print(readCorpus.getUntokenizedDocument(at: key))
}
try readTransaction.commit()
} catch {
print("Error when trying readCorpus.index.find(): \(error)")
}
}
func testBuildGuelphSubredditCorpus() async throws {
/// Generates the LMDB durable storage to disk but runs no tests otherwise
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Get subreddit data
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
fatalError("Failed to find Guelph_submissions.zst in test bundle.")
}
guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
fatalError("Failed to load Guelph_submissions.zst from test bundle.")
}
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
let transaction = try Transaction.begin(.write, in: env)
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
encoder: documentEncoder,
namespace: "subreddit_durable",
in: transaction
)
/// Add documents to corpus
for submission in submissions {
if let text = submission.selftext {
try corpus.addUntokenizedDocument(text, in: transaction)
}
}
/// Save dictionary to disk
corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
try transaction.commit()
}
func testQueryGuelphSubredditCorpus() async throws {
let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
/// Setting up the environment
let env = try Environment()
try env.setMapSize(ONE_GB)
try env.setMaxReaders(DEFAULT_MAXREADERS)
try env.setMaxDBs(DEFAULT_MAXDBS)
try env.open(path: workingDirectoryPath)
/// Reading the memory map (and dictionary) from disk
let transaction = try Transaction.begin(.read, in: env)
let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
encoder: documentEncoder,
namespace: "subreddit_durable",
in: transaction
)
corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
let query = "I love waterloo and I love the geese."
let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }
let results = try corpus.index.find(near: queryVector, limit: 8, in: transaction)
for result in results {
let key = Int(result.id.foreignKey)!
print(corpus.getUntokenizedDocument(at: key))
}
}
}
#endif
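A possible interim check for the size-on-read bug noted in testBuildBasicCorpus: the dictionary persisted via saveDictionaryToDownloads still reflects every insert, so after reloading it, its count can stand in for corpus.count. A sketch of the read phase under that assumption (it presumes the write phase saved the dictionary, as testQueryBasicCorpus does):

// Sketch: derive the document count from the persisted dictionary,
// since the index's in-memory size counter is not restored on read.
let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
    namespace: "testBasicExample",
    in: readTransaction
)
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
XCTAssertEqual(readCorpus.dictionary.count, 3)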
@@ -165,7 +165,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase {
     let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)

     //let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
-    let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
+    let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>(typicalNeighborhoodSize: 10)

     for submission in submissions {
         if let text = submission.selftext {
......