From ce70016d7da2518187cf8dc2267b0f97f4acbee4 Mon Sep 17 00:00:00 2001
From: Mingchung Xia <mingchung.xia@gmail.com>
Date: Mon, 25 Mar 2024 22:41:09 -0400
Subject: [PATCH] Code cleanup and documentation

---
 .../EphemeralHNSWCorpus + Codable.swift       |  82 ++---
 .../EphemeralHNSWCorpus.swift                 |  19 +-
 ...nisticEphemeralVectorIndex + Codable.swift | 100 +++---
 .../HNSW/HNSWCorpusDataHandler.swift          | 298 +++++++++---------
 .../HNSW/RNG/MersenneTwisterRNG.swift         |   4 +
 .../HNSW/DurableHNSWCorpusTests.swift         |   8 +
 .../HNSW/EphemeralHNSWCorpusTests.swift       |   3 -
 7 files changed, 264 insertions(+), 250 deletions(-)

diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift
index a4a9b3c0..a7189dc6 100644
--- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift	
@@ -1,44 +1,44 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-02-07.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//// MARK: Decodable conformance is in HNSWCorpus
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
-//
-// Created by Mingchung Xia on 2024-02-07.
-//
-
-import Foundation
-
-// MARK: Decodable conformance is in HNSWCorpus
-
-extension EphemeralHNSWCorpus: Codable {
-    enum CodingKeys: String, CodingKey {
-        case _documentEncoder
-        case encodedDocuments
-        case dictionary
-    }
-    
-    func encode(to encoder: Encoder) throws {
-        var container = encoder.container(keyedBy: CodingKeys.self)
-        try container.encode(_documentEncoder, forKey: ._documentEncoder)
-        try container.encode(encodedDocuments, forKey: .encodedDocuments)
-        try container.encode(dictionary, forKey: .dictionary)
-    }
-}
+//extension EphemeralHNSWCorpus: Codable {
+//    enum CodingKeys: String, CodingKey {
+//        case _documentEncoder
+//        case encodedDocuments
+//        case dictionary
+//    }
+//    
+//    func encode(to encoder: Encoder) throws {
+//        var container = encoder.container(keyedBy: CodingKeys.self)
+//        try container.encode(_documentEncoder, forKey: ._documentEncoder)
+//        try container.encode(encodedDocuments, forKey: .encodedDocuments)
+//        try container.encode(dictionary, forKey: .dictionary)
+//    }
+//}
diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift
index 99829032..acbc6e70 100644
--- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift	
@@ -21,6 +21,11 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 //
+// The HNSW work is based on the original work of Jaden Geller.
+// See https://github.com/JadenGeller/similarity-topology.git
+// for reference. The code is used with permission from the author
+// under the MIT License.
+//
 // Created by Mingchung Xia on 2024-02-14.
 //
 
@@ -54,13 +59,13 @@ final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorp
         encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize)
     }
     
-     // Decodable conformance
-    required init(from decoder: Decoder) throws {
-        let container = try decoder.container(keyedBy: CodingKeys.self)
-        _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder)
-        encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments)
-        dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary)
-    }
+//     // Decodable conformance
+//    required init(from decoder: Decoder) throws {
+//        let container = try decoder.container(keyedBy: CodingKeys.self)
+//        _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder)
+//        encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments)
+//        dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary)
+//    }
     
     @inlinable
     func addUntokenizedDocument(_ document: String) {
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift
index 84459298..1a1d2eaf 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift	
@@ -1,55 +1,55 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-02-07.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//import Foundation
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable {
+//    enum CodingKeys: String, CodingKey {
+//        case typicalNeighborhoodSize
+//        case vectors
+//    }
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
+//    public func encode(to encoder: Encoder) throws {
+//        var container = encoder.container(keyedBy: CodingKeys.self)
+//        try container.encode(typicalNeighborhoodSize, forKey: .typicalNeighborhoodSize)
+//        try container.encode(base.vectors, forKey: .vectors)
+//    }
+//}
+//
+//extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable {
+//    public init(from decoder: Decoder) throws {
+//        let container = try decoder.container(keyedBy: CodingKeys.self)
+//        let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize)
+//        let vectors = try container.decode([Vector].self, forKey: .vectors)
+//
+//        self.init(typicalNeighborhoodSize: typicalNeighborhoodSize)
+//        for vector in vectors {
+//            self.insert(vector)
+//        }
+//    }
+//}
 //
-// Created by Mingchung Xia on 2024-02-07.
 //
-
-import Foundation
-
-extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable {
-    enum CodingKeys: String, CodingKey {
-        case typicalNeighborhoodSize
-        case vectors
-    }
-
-    public func encode(to encoder: Encoder) throws {
-        var container = encoder.container(keyedBy: CodingKeys.self)
-        try container.encode(typicalNeighborhoodSize, forKey: .typicalNeighborhoodSize)
-        try container.encode(base.vectors, forKey: .vectors)
-    }
-}
-
-extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable {
-    public init(from decoder: Decoder) throws {
-        let container = try decoder.container(keyedBy: CodingKeys.self)
-        let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize)
-        let vectors = try container.decode([Vector].self, forKey: .vectors)
-
-        self.init(typicalNeighborhoodSize: typicalNeighborhoodSize)
-        for vector in vectors {
-            self.insert(vector)
-        }
-    }
-}
-
-
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift
index 915e254a..9348babd 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift	
@@ -1,154 +1,154 @@
-// Copyright (c) 2024 Jim Wallace
+//// Copyright (c) 2024 Jim Wallace
+////
+//// Permission is hereby granted, free of charge, to any person
+//// obtaining a copy of this software and associated documentation
+//// files (the "Software"), to deal in the Software without
+//// restriction, including without limitation the rights to use,
+//// copy, modify, merge, publish, distribute, sublicense, and/or sell
+//// copies of the Software, and to permit persons to whom the
+//// Software is furnished to do so, subject to the following
+//// conditions:
+////
+//// The above copyright notice and this permission notice shall be
+//// included in all copies or substantial portions of the Software.
+////
+//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+//// OTHER DEALINGS IN THE SOFTWARE.
+////
+//// Created by Mingchung Xia on 2024-02-13.
+////
 //
-// Permission is hereby granted, free of charge, to any person
-// obtaining a copy of this software and associated documentation
-// files (the "Software"), to deal in the Software without
-// restriction, including without limitation the rights to use,
-// copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the
-// Software is furnished to do so, subject to the following
-// conditions:
+//// MARK: This is outdated now that DurableHNSWCorpus exists, but it is kept available for reference
 //
-// The above copyright notice and this permission notice shall be
-// included in all copies or substantial portions of the Software.
+//import Foundation
 //
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
-// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
-// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
-// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-// OTHER DEALINGS IN THE SOFTWARE.
+//final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> {
+//    var corpus: HNSWCorpus<Scalar>
+//    private var url: URL?
+//    
+//    init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") {
+//        self.corpus = corpus
+////        self.url = Bundle.module.url(forResource: resource, withExtension: "mmap")
+//        if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first {
+//            self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap")
+//        }
+//    }
+//    
+//    /// It is very difficult to get the exact size of the corpus as every class also depends on other classes
+//    /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part
+//    func getCorpusSize() -> Int {
+////        return heapSize(corpus)
+////        return class_getInstanceSize(type(of: corpus))
+////        return MemoryLayout.size(ofValue: corpus)
+//        var size = 0
+//        let data = corpus.encodedDocuments.base.vectors
+//        for vector in data {
+//            size += MemoryLayout.size(ofValue: vector)
+//        }
+//        return size
+//    }
+//    
+//    func getDictionarySize(includeKey: Bool = true) -> Int {
+//        var size = 0
+//        let data = corpus.getDictionary()
+//        for (key, documentVectorPair) in data {
+//            if includeKey { size += MemoryLayout.size(ofValue: key) }
+//            size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument)
+//            size += MemoryLayout.size(ofValue: documentVectorPair.vector)
+//        }
+//        return size
+//    }
+//    
+//    private func heapSize(_ obj: AnyObject) -> Int {
+//        return malloc_size(Unmanaged.passUnretained(obj).toOpaque())
+//    }
+//}
 //
-// Created by Mingchung Xia on 2024-02-13.
-//
-
-// MARK: This is outdated since we now have the presence of a DurableHNSWCorpus but still available for reference
-
-import Foundation
-
-final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> {
-    var corpus: HNSWCorpus<Scalar>
-    private var url: URL?
-    
-    init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") {
-        self.corpus = corpus
-//        self.url = Bundle.module.url(forResource: resource, withExtension: "mmap")
-        if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first {
-            self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap")
-        }
-    }
-    
-    /// It is very difficult to get the exact size of the corpus as every class also depends on other classes
-    /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part
-    func getCorpusSize() -> Int {
-//        return heapSize(corpus)
-//        return class_getInstanceSize(type(of: corpus))
-//        return MemoryLayout.size(ofValue: corpus)
-        var size = 0
-        let data = corpus.encodedDocuments.base.vectors
-        for vector in data {
-            size += MemoryLayout.size(ofValue: vector)
-        }
-        return size
-    }
-    
-    func getDictionarySize(includeKey: Bool = true) -> Int {
-        var size = 0
-        let data = corpus.getDictionary()
-        for (key, documentVectorPair) in data {
-            if includeKey { size += MemoryLayout.size(ofValue: key) }
-            size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument)
-            size += MemoryLayout.size(ofValue: documentVectorPair.vector)
-        }
-        return size
-    }
-    
-    private func heapSize(_ obj: AnyObject) -> Int {
-        return malloc_size(Unmanaged.passUnretained(obj).toOpaque())
-    }
-}
-
-extension HNSWCorpusDataHandler {
-    func saveMemoryMap() {
-        guard let url = url else {
-            print("URL to resource not found")
-            return
-        }
-        let fileManager = FileManager.default
-        if !fileManager.fileExists(atPath: url.path) {
-            fileManager.createFile(atPath: url.path, contents: nil, attributes: nil)
-        }
-        do {
-//            let fileHandle = try FileHandle(forWritingTo: url)
+//extension HNSWCorpusDataHandler {
+//    func saveMemoryMap() {
+//        guard let url = url else {
+//            print("URL to resource not found")
+//            return
+//        }
+//        let fileManager = FileManager.default
+//        if !fileManager.fileExists(atPath: url.path) {
+//            fileManager.createFile(atPath: url.path, contents: nil, attributes: nil)
+//        }
+//        do {
+////            let fileHandle = try FileHandle(forWritingTo: url)
+////            
+////            let count = corpus.count
+////            let countData = withUnsafeBytes(of: count) { Data($0) }
+////            fileHandle.write(countData)
+////
+////            for pair in corpus {
+////                let documentData = pair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) }
+////                fileHandle.write(documentData)
+////            }
+////            fileHandle.closeFile()
 //            
-//            let count = corpus.count
-//            let countData = withUnsafeBytes(of: count) { Data($0) }
-//            fileHandle.write(countData)
-//
-//            for pair in corpus {
-//                let documentData = pair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) }
-//                fileHandle.write(documentData)
-//            }
-//            fileHandle.closeFile()
-            
-            print("Saving HNSW to file...")
-            /// Using the Codable conformances
-            let encoder = JSONEncoder()
-            let encoded = try encoder.encode(corpus)
-            try encoded.write(to: url)
-        } catch {
-            print("Error writing HNSW to file: \(error)")
-        }
-    }
-    
-    /// This saves only the untokenized documents dictionary map
-    func saveDictionaryMemoryMap() {
-        // TODO: Move from DurableHNSW extension once HNSW wrapper is created
-    }
-    
-    // TODO: find out how to not rebuild the index
-    static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
-        guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else {
-            print("URL to resource not found")
-            return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
-        }
-        
-        var loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
-        
-        do {
-//            let data = try Data(contentsOf: url, options: .alwaysMapped)
-//            let countData = data.prefix(MemoryLayout<Int>.size)
-//            let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) }
-//            var index = MemoryLayout<Int>.size
-//
-//            for _ in 0..<count {
-//                if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) {
-//                    let documentData = data[index..<stringRange.lowerBound]
-//                    if let document = String(data: documentData, encoding: .utf8) {
-//                        // Add the untokenized document to the corpus
-//                        loadedCorpus.addUntokenizedDocument(document)
-//                        index = stringRange.upperBound
-//                    }
-//                } else {
-//                    break
-//                }
-//            }
-            
-            /// Using the Codable conformances
-            print("Loading HNSW from file...")
-            let decoder = JSONDecoder()
-            let data = try Data(contentsOf: url)
-            loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data)
-        } catch {
-            print("Error reading HNSW from file: \(error)")
-        }
-        return loadedCorpus
-    }
-    
-    static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
-        let encoder = ContextFreeEncoder<Scalar>(source: encoding)
-        return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource)
-    }
-}
+//            print("Saving HNSW to file...")
+//            /// Using the Codable conformances
+//            let encoder = JSONEncoder()
+//            let encoded = try encoder.encode(corpus)
+//            try encoded.write(to: url)
+//        } catch {
+//            print("Error writing HNSW to file: \(error)")
+//        }
+//    }
+//    
+//    /// This saves only the untokenized documents dictionary map
+//    func saveDictionaryMemoryMap() {
+//        // TODO: Move from DurableHNSW extension once HNSW wrapper is created
+//    }
+//    
+//    // TODO: find out how to not rebuild the index
+//    static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
+//        guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else {
+//            print("URL to resource not found")
+//            return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
+//        }
+//        
+//        var loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
+//        
+//        do {
+////            let data = try Data(contentsOf: url, options: .alwaysMapped)
+////            let countData = data.prefix(MemoryLayout<Int>.size)
+////            let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) }
+////            var index = MemoryLayout<Int>.size
+////
+////            for _ in 0..<count {
+////                if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) {
+////                    let documentData = data[index..<stringRange.lowerBound]
+////                    if let document = String(data: documentData, encoding: .utf8) {
+////                        // Add the untokenized document to the corpus
+////                        loadedCorpus.addUntokenizedDocument(document)
+////                        index = stringRange.upperBound
+////                    }
+////                } else {
+////                    break
+////                }
+////            }
+//            
+//            /// Using the Codable conformances
+//            print("Loading HNSW from file...")
+//            let decoder = JSONDecoder()
+//            let data = try Data(contentsOf: url)
+//            loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data)
+//        } catch {
+//            print("Error reading HNSW from file: \(error)")
+//        }
+//        return loadedCorpus
+//    }
+//    
+//    static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
+//        let encoder = ContextFreeEncoder<Scalar>(source: encoding)
+//        return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource)
+//    }
+//}
diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift
index f96caec7..5eabd7ac 100644
--- a/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift	
+++ b/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift	
@@ -21,6 +21,10 @@
 // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 // OTHER DEALINGS IN THE SOFTWARE.
 //
+// See https://github.com/JadenGeller/similarity-topology.git
+// for reference. The code is used with permission from the author
+// under the MIT License.
+//
 // Created by Mingchung Xia on 2024-01-28.
 //
 
diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift
index 56be08db..da751bce 100644
--- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
+++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift	
@@ -34,6 +34,14 @@ final class DurableHNSWCorpusTests: XCTestCase {
         let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
         try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
         workingDirectoryPath = FilePath(directoryURL.path)
+        
+        /// Alternatively, the commented-out code below works in the Xcode bundle resource environment
+//        guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
+//        let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
+//        let fileManager = FileManager.default
+//        try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
+//        print("Resources directory: \(resourcesDirectoryURL)")
+//        workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
     }
     
     func testBasicExample() throws {
diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift
index d6b39798..13618f4e 100644
--- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift	
+++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift	
@@ -5,9 +5,6 @@ import System
 @testable import SwiftNLP
 
 final class EphemeralHNSWCorpusTests: XCTestCase {
-    // MARK: There is also an HNSWCorpusDataHandler class which can store an EphemeralHNSWCorpus into a memory map
-    /// However, it is not recommended to use this for large datasets since it uses a currently slow coding protocol conformance with JSONEncoder/Decoder
-    
     // MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus
     
     // Load a small set of documents and confirm that corpus and dictionary are updated accordingly
-- 
GitLab