diff --git a/Sources/SwiftNLP/1. Data Collection/CartesianDistanceMetric.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/CartesianDistanceMetric.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/CartesianDistanceMetric.swift diff --git a/Sources/SwiftNLP/1. Data Collection/DeterministicSampleVectorIndex + Codable.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift similarity index 80% rename from Sources/SwiftNLP/1. Data Collection/DeterministicSampleVectorIndex + Codable.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift index 715c50e0ee73dff4ae64798737806c854aaa0d5a..cd5b319cab551c331b5064599553b25419d44081 100644 --- a/Sources/SwiftNLP/1. Data Collection/DeterministicSampleVectorIndex + Codable.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex + Codable.swift @@ -1,5 +1,5 @@ // -// DeterministicSampleVectorIndex + Codable.swift +// DeterministicEphemeralVectorIndex + Codable.swift // // // Created by Mingchung Xia on 2024-02-07. @@ -7,7 +7,7 @@ import Foundation -extension DeterministicSampleVectorIndex: Encodable where Vector: Encodable { +extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable { enum CodingKeys: String, CodingKey { case typicalNeighborhoodSize case vectors @@ -20,7 +20,7 @@ extension DeterministicSampleVectorIndex: Encodable where Vector: Encodable { } } -extension DeterministicSampleVectorIndex: Decodable where Vector: Decodable { +extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable { public init(from decoder: Decoder) throws { let container = try decoder.container(keyedBy: CodingKeys.self) let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize) diff --git a/Sources/SwiftNLP/1. 
Data Collection/DeterministicSampleVectorIndex.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift similarity index 94% rename from Sources/SwiftNLP/1. Data Collection/DeterministicSampleVectorIndex.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift index d30df72a0989b7399e00deb7445b00651802738a..1029b5846c14a2b171415c02bfd1e49e960f0164 100644 --- a/Sources/SwiftNLP/1. Data Collection/DeterministicSampleVectorIndex.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicEphemeralVectorIndex.swift @@ -30,9 +30,9 @@ import PriorityHeapAlgorithms import HNSWAlgorithm import HNSWEphemeral -// It may be useful to conform to Sequence and/or Collection +// MARK: This uses the temporary EphemeralVectorIndex -public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint { +public struct DeterministicEphemeralVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint { public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<Vector>, Void> public var base: Index diff --git a/Sources/SwiftNLP/1. Data Collection/DeterministicRandomNumberGenerator.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicRandomNumberGenerator.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/DeterministicRandomNumberGenerator.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/DeterministicRandomNumberGenerator.swift diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift similarity index 50% rename from Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift index 74497a4e6c725e7416713313bf65aa6a4b6209ac..7911ebbf91445c07b333d67ba336615ea75f50f9 100644 --- a/Sources/SwiftNLP/1. 
Data Collection/HNSWCorpusDataHandler.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSW/HNSWCorpusDataHandler.swift @@ -61,56 +61,70 @@ extension HNSWCorpusDataHandler { fileManager.createFile(atPath: url.path, contents: nil, attributes: nil) } do { - let fileHandle = try FileHandle(forWritingTo: url) - - let count = corpus.count - let countData = withUnsafeBytes(of: count) { Data($0) } - fileHandle.write(countData) - - // TODO: We may need to edit the HNSWCorpus iterator to actually iterate over its dictionary as it would be useful here - let data = corpus.getDictionary() - for (key, documentVectorPair) in data { - let documentData = documentVectorPair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) } - fileHandle.write(documentData) - } - fileHandle.closeFile() +// let fileHandle = try FileHandle(forWritingTo: url) +// +// let count = corpus.count +// let countData = withUnsafeBytes(of: count) { Data($0) } +// fileHandle.write(countData) +// +// // TODO: We may need to edit the HNSWCorpus iterator to actually iterate over its dictionary as it would be useful here +// let data = corpus.getDictionary() +// for (key, documentVectorPair) in data { +// let documentData = documentVectorPair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) } +// fileHandle.write(documentData) +// } +// fileHandle.closeFile() + print("Saving HNSW to file...") + /// Using the Codable conformances + let encoder = JSONEncoder() + let encoded = try encoder.encode(corpus) + try encoded.write(to: url) } catch { print("Error writing HNSW to file: \(error)") } } - // TODO: Change the return from Double to Scalar - // TODO: Change to encoder parameter (any SNLPEncoder) - static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, resource: String = "hnsw") -> HNSWCorpus<Double> { - let _documentEncoder = ContextFreeEncoder(source: encoding) - + // TODO: find out how to not rebuild the index + static func 
loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { print("URL to resource not found") - return HNSWCorpus(encoder: _documentEncoder) + return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) } - var loadedCorpus = HNSWCorpus(encoder: _documentEncoder) + + var loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize) do { - let data = try Data(contentsOf: url, options: .alwaysMapped) - let countData = data.prefix(MemoryLayout<Int>.size) - let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) } - var index = MemoryLayout<Int>.size - - for _ in 0..<count { - if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) { - let documentData = data[index..<stringRange.lowerBound] - if let document = String(data: documentData, encoding: .utf8) { - // Add the untokenized document to the corpus - loadedCorpus.addUntokenizedDocument(document) - index = stringRange.upperBound - } - } else { - break - } - } +// let data = try Data(contentsOf: url, options: .alwaysMapped) +// let countData = data.prefix(MemoryLayout<Int>.size) +// let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) } +// var index = MemoryLayout<Int>.size +// +// for _ in 0..<count { +// if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) 
{ +// let documentData = data[index..<stringRange.lowerBound] +// if let document = String(data: documentData, encoding: .utf8) { +// // Add the untokenized document to the corpus +// loadedCorpus.addUntokenizedDocument(document) +// index = stringRange.upperBound +// } +// } else { +// break +// } +// } + + /// Using the Codable conformances + print("Loading HNSW from file...") + let decoder = JSONDecoder() + let data = try Data(contentsOf: url) + loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data) } catch { print("Error reading HNSW from file: \(error)") } return loadedCorpus } + + static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { + let encoder = ContextFreeEncoder<Scalar>(source: encoding) + return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource) + } } diff --git a/Sources/SwiftNLP/1. Data Collection/SeedableRandomNumberGenerator.swift b/Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRandomNumberGenerator.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/SeedableRandomNumberGenerator.swift rename to Sources/SwiftNLP/1. Data Collection/HNSW/SeedableRandomNumberGenerator.swift diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Codable.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Codable.swift index 3b29b1e222e93e5d766cea71dbc4c5aceb510795..aee1feba2057eee9b0e7ae66614c7aa0984a69c0 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Codable.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Codable.swift @@ -5,21 +5,21 @@ // Created by Mingchung Xia on 2024-02-07. 
// -//import Foundation +import Foundation // MARK: Decodable conformance is in HNSWCorpus -//extension HNSWCorpus: Codable { -// enum CodingKeys: String, CodingKey { -// case _documentEncoder -// case zeroes -// case encodedDocuments -// } -// -// func encode(to encoder: Encoder) throws { -// var container = encoder.container(keyedBy: CodingKeys.self) -// try container.encode(_documentEncoder, forKey: ._documentEncoder) -// try container.encode(zeroes, forKey: .zeroes) -// try container.encode(encodedDocuments, forKey: .encodedDocuments) -// } -//} +extension HNSWCorpus: Codable { + enum CodingKeys: String, CodingKey { + case _documentEncoder + case encodedDocuments + case dictionary + } + + func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encode(_documentEncoder, forKey: ._documentEncoder) + try container.encode(encodedDocuments, forKey: .encodedDocuments) + try container.encode(dictionary, forKey: .dictionary) + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Dictionary.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Dictionary.swift index e4eb820c5a71ba592b3bafe3ba795dd94b841fcc..e8fb97347ba1cb821b34b2bade98132711186e9e 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus + Dictionary.swift +++ b/Sources/SwiftNLP/1. 
Data Collection/HNSWCorpus + Dictionary.swift @@ -49,3 +49,22 @@ extension HNSWCorpus { ) } } + +extension HNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable { + enum CodingKeys: String, CodingKey { + case untokenizedDocument + case vector + } + + internal init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + untokenizedDocument = try container.decode(String.self, forKey: .untokenizedDocument) + vector = try container.decode([Scalar].self, forKey: .vector) + } + + internal func encode(to encoder: Encoder) throws { + var container = encoder.container(keyedBy: CodingKeys.self) + try container.encode(untokenizedDocument, forKey: .untokenizedDocument) + try container.encode(vector, forKey: .vector) + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift index 5a065135a896ff7818005d3fba06cbb8abcc828d..22d6cdc951d370f1d7328f8b72ac91cd0816ed6d 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpus.swift @@ -24,35 +24,41 @@ import Foundation class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { - + public enum MemoryDuration { + case ephemeral + case durable + } + + public typealias HNSWDictionary = [Int: DocumentVectorPair] + internal var _documentEncoder: any SNLPEncoder var zeroes: [Scalar] { _documentEncoder.zeroes as! 
[Scalar] } - var encodedDocuments: DeterministicSampleVectorIndex<[Scalar]> + var encodedDocuments: DeterministicEphemeralVectorIndex<[Scalar]> var count: Int { encodedDocuments.base.vectors.count } // Keeps track of the original document for client code - var dictionary: [Int: DocumentVectorPair] = [:] + var dictionary: HNSWDictionary = [:] // typicalNeighbourhoodSize = 20 is a standard benchmark init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20) { _documentEncoder = ContextFreeEncoder(source: encoding) - encodedDocuments = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize) + encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize) } init(encoder: any SNLPEncoder, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20) { _documentEncoder = encoder - encodedDocuments = DeterministicSampleVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize) + encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(typicalNeighborhoodSize: typicalNeighborhoodSize) } // Decodable conformance -// required init(from decoder: Decoder) throws { -// let container = try decoder.container(keyedBy: CodingKeys.self) -// _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder) -// zeroes = try container.decode([Scalar].self, forKey: .zeroes) -// encodedDocuments = try container.decode(DeterministicSampleVectorIndex<[Scalar]>.self, forKey: .encodedDocuments) -// } + required init(from decoder: Decoder) throws { + let container = try decoder.container(keyedBy: CodingKeys.self) + _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder) + encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments) + dictionary = try 
container.decode(HNSWDictionary.self, forKey: .dictionary) + } @inlinable func addUntokenizedDocument(_ document: String) { diff --git a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift index cd5b35ef7097c9c900f316609571e037ca94417e..5f186318c50b0a0ab949b494fcce9fbd10bec4bb 100644 --- a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift +++ b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift @@ -62,7 +62,7 @@ extension ContextFreeEncoder { // These use memory mapping to load the values in more quickly // TODO: Validate that this actually works on other systems... could easily be some issues - static func readDictionaryFromFile(_ url: URL) -> [String : [Scalar]] { + static func readDictionaryFromFile(_ url: URL, width: Int = 50) -> [String : [Scalar]] { //let fileURL = URL(fileURLWithPath: filename) var result: [String : [Scalar]] @@ -87,7 +87,7 @@ extension ContextFreeEncoder { index = stringRange.upperBound // Read the values - let valuesData = data[index..<(index + 50 * MemoryLayout<Double>.size)] + let valuesData = data[index..<(index + width * MemoryLayout<Double>.size)] let values = valuesData.withUnsafeBytes { Array($0.bindMemory(to: Scalar.self)) } // Add the key-value pair to the dictionary @@ -95,7 +95,7 @@ extension ContextFreeEncoder { //debugPrint("\(key) -> \(values[0])") } - index += 50 * MemoryLayout<Double>.size //TODO: Why is this magical 50 here? 
+ index += width * MemoryLayout<Double>.size } else { break } diff --git a/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap index d0d92e57876b3db107de88afb6eb3decbb9755a5..27ec8504e9b276b85b994e7a800b3d702ad1facc 100644 Binary files a/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap and b/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap new file mode 100644 index 0000000000000000000000000000000000000000..fde0a6d18e6963661d8f6dafe5eaa7d62058b3b7 Binary files /dev/null and b/Sources/SwiftNLP/Resources/hnsw_testbasicqueryexample.mmap differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap deleted file mode 100644 index 4f212d4768d1504de7684d8e4fc27cae3d1e9db3..0000000000000000000000000000000000000000 Binary files a/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap and /dev/null differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap b/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap index 48d76cb321a78f73ba95e222efcb553469df352f..56ba6ddb016fd6d65ae86aeaf7f82666a3d07941 100644 Binary files a/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap and b/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap differ diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index be33da78119d3fed2d4543105264896c6f08f094..d3696cbc06f36ae45841b666a52e9c0f3f916864 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. 
Encoding/HNSWTests.swift @@ -4,7 +4,21 @@ import Foundation @testable import SwiftNLP final class HNSWTests: XCTestCase { + /* + MARK: To save a memory map HNSWCorpus (example): + + let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + let corpus = HNSWCorpus<Double>(documentEncoder: _documentEncoder) + corpus.addUntokenizedDocuments(docs) + let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") + dataHandler.saveMemoryMap() + */ + /* + MARK: To load a memory map HNSWCorpus (example): + + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") + */ // Load a small set of documents and confirm that corpus and dictionary are updated accordingly func testBasicExample() throws { @@ -14,16 +28,16 @@ final class HNSWTests: XCTestCase { "that enable us to train deep learning algorithms to learn like the human brain." ] - var corpus = HNSWCorpus(encoding: .glove6B50d) - corpus.addUntokenizedDocuments(docs) - - let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") - let corpusSize = dataHandler.getCorpusSize() - let dictionarySize = dataHandler.getDictionarySize(includeKey: false) - print("Corpus size: \(corpusSize) bytes") - print("Dictionary size: \(dictionarySize) bytes") - dataHandler.saveMemoryMap() - // let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") +// var corpus = HNSWCorpus(encoding: .glove6B50d) +// corpus.addUntokenizedDocuments(docs) +// +// let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") +// let corpusSize = dataHandler.getCorpusSize() +// let dictionarySize = dataHandler.getDictionarySize(includeKey: false) +// print("Corpus size: \(corpusSize) bytes") +// print("Dictionary size: \(dictionarySize) bytes") +// dataHandler.saveMemoryMap() + let corpus = 
HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") XCTAssert(corpus.count == 3) @@ -79,103 +93,92 @@ final class HNSWTests: XCTestCase { } func testSubreddit() async throws { - - guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { - fatalError("Failed to find waterloo_submissions.zst in test bundle.") - } - guard let submissionsData = try? Data(contentsOf: submissionsURL) else { - fatalError("Failed to load waterloo_submissions.zst from test bundle.") - } - - let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) - - var corpus = HNSWCorpus(encoding: .glove6B50d) - - for submission in submissions { - if let text = submission.selftext { - corpus.addUntokenizedDocument(text) - } - } - - let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testsubreddit") - let corpusSize = dataHandler.getCorpusSize() - let dictionarySize = dataHandler.getDictionarySize(includeKey: false) - print("Corpus size: \(corpusSize) bytes") - print("Dictionary size: \(dictionarySize) bytes") - dataHandler.saveMemoryMap() +// +// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { +// fatalError("Failed to find waterloo_submissions.zst in test bundle.") +// } +// guard let submissionsData = try? 
Data(contentsOf: submissionsURL) else { +// fatalError("Failed to load waterloo_submissions.zst from test bundle.") +// } +// +// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) +// +// var corpus = HNSWCorpus(encoding: .glove6B50d) +// +// for submission in submissions { +// if let text = submission.selftext { +// corpus.addUntokenizedDocument(text) +// } +// } +// +// let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testsubreddit") +// let corpusSize = dataHandler.getCorpusSize() +// let dictionarySize = dataHandler.getDictionarySize(includeKey: false) +// print("Corpus size: \(corpusSize) bytes") +// print("Dictionary size: \(dictionarySize) bytes") +// dataHandler.saveMemoryMap() //print("Loaded \(corpus.count) documents.") + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit") XCTAssert(corpus.count == 17999) } - func testTypicalNeighborhoodSizes() throws { - // TODO: Debug - Fatal error: Double value cannot be converted to Int because it is outside the representable range -// let twentyQuotes = [ -// "Imagination is more important than knowledge. - Albert Einstein", -// "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking", -// "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton", -// "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman", -// "Science is the belief in the ignorance of experts. - Richard Feynman", -// "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov", -// "Science is the poetry of reality. - Richard Dawkins", -// "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. 
- Albert Einstein", -// "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré", -// "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie", -// "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck", -// "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan", -// "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert", -// "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth", -// "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac", -// "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan", -// "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun", -// "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein", -// "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking", -// "All science is either physics or stamp collecting. 
- Ernest Rutherford" -// ] -// -// let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) -// -// for typicalNeighborhoodSize in 0..<100 { -// let start = Date() -// -// var corpus = HNSWCorpus(encoder: _documentEncoder, typicalNeighborhoodSize: typicalNeighborhoodSize) -// corpus.addUntokenizedDocuments(twentyQuotes) -// -// let end = Date() -// let runtime = end.timeIntervalSince(start) -// -// let size = MemoryLayout.size(ofValue: corpus) -// print("Typical Neighbor Size: \(typicalNeighborhoodSize)") -// print("Approximate memory footprint: \(size) bytes") -// print("Runtime: \(runtime) seconds") -// } + func testTypicalNeighborhoodSizeExecutionTime() throws { +// let range = 0..<100 + let values = [2, 4, 8, 20, 32, 128, 512, 1000] + + let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) + + for typicalNeighborhoodSize in values { + let start = Date() + + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: "hnsw_testbiggerexample") + + let buildend = Date() + let buildtime = buildend.timeIntervalSince(start) + + print("buildtime for typicalNeighborhoodSize = \(typicalNeighborhoodSize): \(buildtime) seconds") + + let query = "I like to read about new technology and artificial intelligence" + let queryVector = _documentEncoder.encodeToken(query) + + /// only runtime matters so we do not care about success + let _ = try? 
corpus.encodedDocuments.find(near: queryVector, limit: 5) + + let searchend = Date() + let searchtime = searchend.timeIntervalSince(buildend) + + print("searchtime for typicalNeighborhoodSize = \(typicalNeighborhoodSize): \(searchtime) seconds") + } } // Refer to AllMiniLM_sampleTest.swift for reference func testBasicQueryExample() async throws { - let docs = [ - "The quick brown fox jumps over the lazy dog", - "I enjoy taking long walks along the beach at sunset", - "Advances in neural networks have enabled new AI capabilities", - "The stock market experienced a significant downturn last week", - "Cooking a good meal can be both an art and a science", - "The exploration of space is both challenging and rewarding", - "Machine learning models are becoming increasingly sophisticated", - "I love reading about history and ancient civilizations" - ] +// let docs = [ +// "The quick brown fox jumps over the lazy dog", +// "I enjoy taking long walks along the beach at sunset", +// "Advances in neural networks have enabled new AI capabilities", +// "The stock market experienced a significant downturn last week", +// "Cooking a good meal can be both an art and a science", +// "The exploration of space is both challenging and rewarding", +// "Machine learning models are becoming increasingly sophisticated", +// "I love reading about history and ancient civilizations" +// ] let query = "I like to read about new technology and artificial intelligence" - +// let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - var corpus = HNSWCorpus(encoder: _documentEncoder) - corpus.addUntokenizedDocuments(docs) - - let dataHandler = HNSWCorpusDataHandler(corpus: corpus) - let corpusSize = dataHandler.getCorpusSize() - let dictionarySize = dataHandler.getDictionarySize(includeKey: false) - print("Corpus size: \(corpusSize) bytes") - print("Dictionary size: \(dictionarySize) bytes") +// var corpus = HNSWCorpus(encoder: _documentEncoder) +// 
corpus.addUntokenizedDocuments(docs) +// +// let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicqueryexample") +// let corpusSize = dataHandler.getCorpusSize() +// let dictionarySize = dataHandler.getDictionarySize(includeKey: false) +// print("Corpus size: \(corpusSize) bytes") +// print("Dictionary size: \(dictionarySize) bytes") +// dataHandler.saveMemoryMap() + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoder: _documentEncoder, resource: "hnsw_testbasicqueryexample") do { print("Attempting to query corpus.encodedDocuments.find()...") @@ -230,7 +233,7 @@ final class HNSWTests: XCTestCase { // print("Corpus size: \(corpusSize) bytes") // print("Dictionary size: \(dictionarySize) bytes") - let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbiggerexample") + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoder: _documentEncoder, resource: "hnsw_testbiggerexample") do { print("Attempting to query corpus.encodedDocuments.find()...") @@ -248,7 +251,6 @@ final class HNSWTests: XCTestCase { } } - // TODO: Get HNSWCorpus from memory map func testSubredditQueryExample() async throws { // guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { @@ -268,7 +270,7 @@ final class HNSWTests: XCTestCase { // corpus.addUntokenizedDocument(text) // } // } - let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit") + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoder: _documentEncoder, resource: "hnsw_testsubreddit") let query = "Mr. Goose is a very important figure at the University of Waterloo." @@ -278,8 +280,6 @@ final class HNSWTests: XCTestCase { print("Corpus size: \(corpusSize) bytes") print("Dictionary size: \(dictionarySize) bytes") - // Load from memory map here - do { print("Attempting to query corpus.encodedDocuments.find()...")