From ce70016d7da2518187cf8dc2267b0f97f4acbee4 Mon Sep 17 00:00:00 2001 From: Mingchung Xia <mingchung.xia@gmail.com> Date: Mon, 25 Mar 2024 22:41:09 -0400 Subject: [PATCH] Code cleanup and documentation --- .../EphemeralHNSWCorpus + Codable.swift | 82 ++--- .../EphemeralHNSWCorpus.swift | 19 +- ...nisticEphemeralVectorIndex + Codable.swift | 100 +++--- .../HNSW/HNSWCorpusDataHandler.swift | 298 +++++++++--------- .../HNSW/RNG/MersenneTwisterRNG.swift | 4 + .../HNSW/DurableHNSWCorpusTests.swift | 8 + .../HNSW/EphemeralHNSWCorpusTests.swift | 3 - 7 files changed, 264 insertions(+), 250 deletions(-) diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift index a4a9b3c0..a7189dc6 100644 --- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift +++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus + Codable.swift @@ -1,44 +1,44 @@ -// Copyright (c) 2024 Jim Wallace +//// Copyright (c) 2024 Jim Wallace +//// +//// Permission is hereby granted, free of charge, to any person +//// obtaining a copy of this software and associated documentation +//// files (the "Software"), to deal in the Software without +//// restriction, including without limitation the rights to use, +//// copy, modify, merge, publish, distribute, sublicense, and/or sell +//// copies of the Software, and to permit persons to whom the +//// Software is furnished to do so, subject to the following +//// conditions: +//// +//// The above copyright notice and this permission notice shall be +//// included in all copies or substantial portions of the Software. +//// +//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +//// NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +//// OTHER DEALINGS IN THE SOFTWARE. +//// +//// Created by Mingchung Xia on 2024-02-07. +//// // -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: +//import Foundation // -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. +//// MARK: Decodable conformance is in HNSWCorpus // -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. -// -// Created by Mingchung Xia on 2024-02-07. 
-// - -import Foundation - -// MARK: Decodable conformance is in HNSWCorpus - -extension EphemeralHNSWCorpus: Codable { - enum CodingKeys: String, CodingKey { - case _documentEncoder - case encodedDocuments - case dictionary - } - - func encode(to encoder: Encoder) throws { - var container = encoder.container(keyedBy: CodingKeys.self) - try container.encode(_documentEncoder, forKey: ._documentEncoder) - try container.encode(encodedDocuments, forKey: .encodedDocuments) - try container.encode(dictionary, forKey: .dictionary) - } -} +//extension EphemeralHNSWCorpus: Codable { +// enum CodingKeys: String, CodingKey { +// case _documentEncoder +// case encodedDocuments +// case dictionary +// } +// +// func encode(to encoder: Encoder) throws { +// var container = encoder.container(keyedBy: CodingKeys.self) +// try container.encode(_documentEncoder, forKey: ._documentEncoder) +// try container.encode(encodedDocuments, forKey: .encodedDocuments) +// try container.encode(dictionary, forKey: .dictionary) +// } +//} diff --git a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift index 99829032..acbc6e70 100644 --- a/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/EphemeralHNSWCorpus.swift @@ -21,6 +21,11 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. // +// The HNSW work is based on the original work of Jaden Geller +// See the https://github.com/JadenGeller/similarity-topology.git +// for reference. The code is used with permission from the author +// under the MIT License. +// // Created by Mingchung Xia on 2024-02-14. 
// @@ -54,13 +59,13 @@ final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorp encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize) } - // Decodable conformance - required init(from decoder: Decoder) throws { - let container = try decoder.container(keyedBy: CodingKeys.self) - _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder) - encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments) - dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary) - } +// // Decodable conformance +// required init(from decoder: Decoder) throws { +// let container = try decoder.container(keyedBy: CodingKeys.self) +// _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder) +// encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments) +// dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary) +// } @inlinable func addUntokenizedDocument(_ document: String) { diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift index 84459298..1a1d2eaf 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift +++ b/Sources/SwiftNLP/1. 
Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift @@ -1,55 +1,55 @@ -// Copyright (c) 2024 Jim Wallace +//// Copyright (c) 2024 Jim Wallace +//// +//// Permission is hereby granted, free of charge, to any person +//// obtaining a copy of this software and associated documentation +//// files (the "Software"), to deal in the Software without +//// restriction, including without limitation the rights to use, +//// copy, modify, merge, publish, distribute, sublicense, and/or sell +//// copies of the Software, and to permit persons to whom the +//// Software is furnished to do so, subject to the following +//// conditions: +//// +//// The above copyright notice and this permission notice shall be +//// included in all copies or substantial portions of the Software. +//// +//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +//// OTHER DEALINGS IN THE SOFTWARE. +//// +//// Created by Mingchung Xia on 2024-02-07. +//// // -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: +//import Foundation // -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. 
+//extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable { +// enum CodingKeys: String, CodingKey { +// case typicalNeighborhoodSize +// case vectors +// } // -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. +// public func encode(to encoder: Encoder) throws { +// var container = encoder.container(keyedBy: CodingKeys.self) +// try container.encode(typicalNeighborhoodSize, forKey: .typicalNeighborhoodSize) +// try container.encode(base.vectors, forKey: .vectors) +// } +//} +// +//extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable { +// public init(from decoder: Decoder) throws { +// let container = try decoder.container(keyedBy: CodingKeys.self) +// let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize) +// let vectors = try container.decode([Vector].self, forKey: .vectors) +// +// self.init(typicalNeighborhoodSize: typicalNeighborhoodSize) +// for vector in vectors { +// self.insert(vector) +// } +// } +//} // -// Created by Mingchung Xia on 2024-02-07. 
// - -import Foundation - -extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable { - enum CodingKeys: String, CodingKey { - case typicalNeighborhoodSize - case vectors - } - - public func encode(to encoder: Encoder) throws { - var container = encoder.container(keyedBy: CodingKeys.self) - try container.encode(typicalNeighborhoodSize, forKey: .typicalNeighborhoodSize) - try container.encode(base.vectors, forKey: .vectors) - } -} - -extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable { - public init(from decoder: Decoder) throws { - let container = try decoder.container(keyedBy: CodingKeys.self) - let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize) - let vectors = try container.decode([Vector].self, forKey: .vectors) - - self.init(typicalNeighborhoodSize: typicalNeighborhoodSize) - for vector in vectors { - self.insert(vector) - } - } -} - - diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift index 915e254a..9348babd 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift +++ b/Sources/SwiftNLP/1. 
Data Collection/HNSW/HNSWCorpusDataHandler.swift @@ -1,154 +1,154 @@ -// Copyright (c) 2024 Jim Wallace +//// Copyright (c) 2024 Jim Wallace +//// +//// Permission is hereby granted, free of charge, to any person +//// obtaining a copy of this software and associated documentation +//// files (the "Software"), to deal in the Software without +//// restriction, including without limitation the rights to use, +//// copy, modify, merge, publish, distribute, sublicense, and/or sell +//// copies of the Software, and to permit persons to whom the +//// Software is furnished to do so, subject to the following +//// conditions: +//// +//// The above copyright notice and this permission notice shall be +//// included in all copies or substantial portions of the Software. +//// +//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +//// OTHER DEALINGS IN THE SOFTWARE. +//// +//// Created by Mingchung Xia on 2024-02-13. 
+//// // -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: +//// MARK: This is outdated since we now have the presence of a DurableHNSWCorpus but still available for reference // -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. +//import Foundation // -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. +//final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> { +// var corpus: HNSWCorpus<Scalar> +// private var url: URL? 
+// +// init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") { +// self.corpus = corpus +//// self.url = Bundle.module.url(forResource: resource, withExtension: "mmap") +// if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first { +// self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap") +// } +// } +// +// /// It is very difficult to get the exact size of the corpus as every class also depends on other classes +// /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part +// func getCorpusSize() -> Int { +//// return heapSize(corpus) +//// return class_getInstanceSize(type(of: corpus)) +//// return MemoryLayout.size(ofValue: corpus) +// var size = 0 +// let data = corpus.encodedDocuments.base.vectors +// for vector in data { +// size += MemoryLayout.size(ofValue: vector) +// } +// return size +// } +// +// func getDictionarySize(includeKey: Bool = true) -> Int { +// var size = 0 +// let data = corpus.getDictionary() +// for (key, documentVectorPair) in data { +// if includeKey { size += MemoryLayout.size(ofValue: key) } +// size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument) +// size += MemoryLayout.size(ofValue: documentVectorPair.vector) +// } +// return size +// } +// +// private func heapSize(_ obj: AnyObject) -> Int { +// return malloc_size(Unmanaged.passUnretained(obj).toOpaque()) +// } +//} // -// Created by Mingchung Xia on 2024-02-13. -// - -// MARK: This is outdated since we now have the presence of a DurableHNSWCorpus but still available for reference - -import Foundation - -final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> { - var corpus: HNSWCorpus<Scalar> - private var url: URL? 
- - init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") { - self.corpus = corpus -// self.url = Bundle.module.url(forResource: resource, withExtension: "mmap") - if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first { - self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap") - } - } - - /// It is very difficult to get the exact size of the corpus as every class also depends on other classes - /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part - func getCorpusSize() -> Int { -// return heapSize(corpus) -// return class_getInstanceSize(type(of: corpus)) -// return MemoryLayout.size(ofValue: corpus) - var size = 0 - let data = corpus.encodedDocuments.base.vectors - for vector in data { - size += MemoryLayout.size(ofValue: vector) - } - return size - } - - func getDictionarySize(includeKey: Bool = true) -> Int { - var size = 0 - let data = corpus.getDictionary() - for (key, documentVectorPair) in data { - if includeKey { size += MemoryLayout.size(ofValue: key) } - size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument) - size += MemoryLayout.size(ofValue: documentVectorPair.vector) - } - return size - } - - private func heapSize(_ obj: AnyObject) -> Int { - return malloc_size(Unmanaged.passUnretained(obj).toOpaque()) - } -} - -extension HNSWCorpusDataHandler { - func saveMemoryMap() { - guard let url = url else { - print("URL to resource not found") - return - } - let fileManager = FileManager.default - if !fileManager.fileExists(atPath: url.path) { - fileManager.createFile(atPath: url.path, contents: nil, attributes: nil) - } - do { -// let fileHandle = try FileHandle(forWritingTo: url) +//extension HNSWCorpusDataHandler { +// func saveMemoryMap() { +// guard let url = url else { +// print("URL to resource not found") +// return +// } +// let fileManager = FileManager.default +// if 
!fileManager.fileExists(atPath: url.path) { +// fileManager.createFile(atPath: url.path, contents: nil, attributes: nil) +// } +// do { +//// let fileHandle = try FileHandle(forWritingTo: url) +//// +//// let count = corpus.count +//// let countData = withUnsafeBytes(of: count) { Data($0) } +//// fileHandle.write(countData) +//// +//// for pair in corpus { +//// let documentData = pair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) } +//// fileHandle.write(documentData) +//// } +//// fileHandle.closeFile() // -// let count = corpus.count -// let countData = withUnsafeBytes(of: count) { Data($0) } -// fileHandle.write(countData) -// -// for pair in corpus { -// let documentData = pair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) } -// fileHandle.write(documentData) -// } -// fileHandle.closeFile() - - print("Saving HNSW to file...") - /// Using the Codable conformances - let encoder = JSONEncoder() - let encoded = try encoder.encode(corpus) - try encoded.write(to: url) - } catch { - print("Error writing HNSW to file: \(error)") - } - } - - /// This saves only the untokenized documents dictionary map - func saveDictionaryMemoryMap() { - // TODO: Move from DurableHNSW extension once HNSW wrapper is created - } - - // TODO: find out how to not rebuild the index - static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { - guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { - print("URL to resource not found") - return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) - } - - var loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) - - do { -// let data = try Data(contentsOf: url, options: .alwaysMapped) -// let countData = data.prefix(MemoryLayout<Int>.size) -// let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) } 
-// var index = MemoryLayout<Int>.size -// -// for _ in 0..<count { -// if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) { -// let documentData = data[index..<stringRange.lowerBound] -// if let document = String(data: documentData, encoding: .utf8) { -// // Add the untokenized document to the corpus -// loadedCorpus.addUntokenizedDocument(document) -// index = stringRange.upperBound -// } -// } else { -// break -// } -// } - - /// Using the Codable conformances - print("Loading HNSW from file...") - let decoder = JSONDecoder() - let data = try Data(contentsOf: url) - loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data) - } catch { - print("Error reading HNSW from file: \(error)") - } - return loadedCorpus - } - - static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { - let encoder = ContextFreeEncoder<Scalar>(source: encoding) - return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource) - } -} +// print("Saving HNSW to file...") +// /// Using the Codable conformances +// let encoder = JSONEncoder() +// let encoded = try encoder.encode(corpus) +// try encoded.write(to: url) +// } catch { +// print("Error writing HNSW to file: \(error)") +// } +// } +// +// /// This saves only the untokenized documents dictionary map +// func saveDictionaryMemoryMap() { +// // TODO: Move from DurableHNSW extension once HNSW wrapper is created +// } +// +// // TODO: find out how to not rebuild the index +// static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { +// guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { +// print("URL to resource not found") +// return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) +// } +// +// var 
loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) +// +// do { +//// let data = try Data(contentsOf: url, options: .alwaysMapped) +//// let countData = data.prefix(MemoryLayout<Int>.size) +//// let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) } +//// var index = MemoryLayout<Int>.size +//// +//// for _ in 0..<count { +//// if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) { +//// let documentData = data[index..<stringRange.lowerBound] +//// if let document = String(data: documentData, encoding: .utf8) { +//// // Add the untokenized document to the corpus +//// loadedCorpus.addUntokenizedDocument(document) +//// index = stringRange.upperBound +//// } +//// } else { +//// break +//// } +//// } +// +// /// Using the Codable conformances +// print("Loading HNSW from file...") +// let decoder = JSONDecoder() +// let data = try Data(contentsOf: url) +// loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data) +// } catch { +// print("Error reading HNSW from file: \(error)") +// } +// return loadedCorpus +// } +// +// static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { +// let encoder = ContextFreeEncoder<Scalar>(source: encoding) +// return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource) +// } +//} diff --git a/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift index f96caec7..5eabd7ac 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/RNG/MersenneTwisterRNG.swift @@ -21,6 +21,10 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. 
// +// See https://github.com/JadenGeller/similarity-topology.git +// for reference. The code is used with permission from the author +// under the MIT License. +// // Created by Mingchung Xia on 2024-01-28. // diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift index 56be08db..da751bce 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift @@ -34,6 +34,14 @@ final class DurableHNSWCorpusTests: XCTestCase { let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb") try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil) workingDirectoryPath = FilePath(directoryURL.path) + + /// This commented-out code alternatively works in the Xcode bundle resource environment +// guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") } +// let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb") +// let fileManager = FileManager.default +// try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil) +// print("Resources directory: \(resourcesDirectoryURL)") +// workingDirectoryPath = FilePath(resourcesDirectoryURL.path) } func testBasicExample() throws { diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift index d6b39798..13618f4e 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. 
Data Collection/HNSW/EphemeralHNSWCorpusTests.swift @@ -5,9 +5,6 @@ import System @testable import SwiftNLP final class EphemeralHNSWCorpusTests: XCTestCase { - // MARK: There is also an HNSWCorpusDataHandler class which can store an EphemeralHNSWCorpus into a memory map - /// However, it is not recommended to use this for large datasets since it uses a currently slow coding protocol conformance with JSONEncoder/Decoder - // MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus // Load a small set of documents and confirm that corpus and dictionary are updated accordingly -- GitLab