diff --git a/Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift b/Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift index be7d814a3ea90f63e900702fa043eca1070ac031..74497a4e6c725e7416713313bf65aa6a4b6209ac 100644 --- a/Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift +++ b/Sources/SwiftNLP/1. Data Collection/HNSWCorpusDataHandler.swift @@ -9,14 +9,53 @@ import Foundation final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> { var corpus: HNSWCorpus<Scalar> + private var url: URL? - init(corpus: HNSWCorpus<Scalar>) { + init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") { self.corpus = corpus + // TODO: Try to fix this to work in the Bundle (does not write but can read) +// self.url = Bundle.module.url(forResource: resource, withExtension: "mmap") + if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first { + self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap") + } + } + + /// It is very difficult to get the exact size of the corpus as every class also depends on other classes + /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part + func getCorpusSize() -> Int { +// return heapSize(corpus) +// return class_getInstanceSize(type(of: corpus)) +// return MemoryLayout.size(ofValue: corpus) + var size = 0 + let data = corpus.encodedDocuments.base.vectors + for vector in data { + size += MemoryLayout.size(ofValue: vector) + } + return size } + + func getDictionarySize(includeKey: Bool = true) -> Int { + var size = 0 + let data = corpus.getDictionary() + for (key, documentVectorPair) in data { + if includeKey { size += MemoryLayout.size(ofValue: key) } + size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument) + size += MemoryLayout.size(ofValue: documentVectorPair.vector) + } + return size + } + +// private func heapSize(_ obj: AnyObject) -> Int { +// return malloc_size(Unmanaged.passUnretained(obj).toOpaque()) +// } } extension HNSWCorpusDataHandler { - func saveMemoryMap(url: URL) { + func saveMemoryMap() { + guard let url = url else { + print("URL to resource not found") + return + } let fileManager = FileManager.default if !fileManager.fileExists(atPath: url.path) { fileManager.createFile(atPath: url.path, contents: nil, attributes: nil) @@ -41,8 +80,15 @@ extension HNSWCorpusDataHandler { } // TODO: Change the return from Double to Scalar - static func loadMemoryMap(url: URL, encoder: any SNLPEncoder) -> HNSWCorpus<Double> { - var loadedCorpus = HNSWCorpus(encoder: encoder) + // TODO: Change to encoder parameter (any SNLPEncoder) + static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, resource: String = "hnsw") -> HNSWCorpus<Double> { + let _documentEncoder = ContextFreeEncoder(source: encoding) + + guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { + print("URL to resource not found") + return HNSWCorpus(encoder: _documentEncoder) + } + var loadedCorpus = HNSWCorpus(encoder: _documentEncoder) do { let data = try Data(contentsOf: url, options: .alwaysMapped) diff --git a/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap new file mode 100644 index 0000000000000000000000000000000000000000..d0d92e57876b3db107de88afb6eb3decbb9755a5 Binary files /dev/null and b/Sources/SwiftNLP/Resources/hnsw_testbasicexample.mmap differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap b/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap new file mode 100644 index 0000000000000000000000000000000000000000..4f212d4768d1504de7684d8e4fc27cae3d1e9db3 Binary files /dev/null and b/Sources/SwiftNLP/Resources/hnsw_testbiggerexample.mmap differ diff --git a/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap b/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap new file mode 100644 index 0000000000000000000000000000000000000000..48d76cb321a78f73ba95e222efcb553469df352f Binary files /dev/null and b/Sources/SwiftNLP/Resources/hnsw_testsubreddit.mmap differ diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index 2b364da02ceafcec2e15edc031c573213a3e1cb4..be33da78119d3fed2d4543105264896c6f08f094 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift @@ -17,8 +17,13 @@ final class HNSWTests: XCTestCase { var corpus = HNSWCorpus(encoding: .glove6B50d) corpus.addUntokenizedDocuments(docs) - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbasicexample") + let corpusSize = dataHandler.getCorpusSize() + let dictionarySize = dataHandler.getDictionarySize(includeKey: false) + print("Corpus size: \(corpusSize) bytes") + print("Dictionary size: \(dictionarySize) bytes") + dataHandler.saveMemoryMap() + // let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbasicexample") XCTAssert(corpus.count == 3) @@ -57,8 +62,13 @@ final class HNSWTests: XCTestCase { var corpus = HNSWCorpus(encoding: .glove6B50d) corpus.addUntokenizedDocuments(twentyQuotes) - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testbiggerexample") + let corpusSize = dataHandler.getCorpusSize() + let dictionarySize = dataHandler.getDictionarySize(includeKey: false) + print("Corpus size: \(corpusSize) bytes") + print("Dictionary size: \(dictionarySize) bytes") + dataHandler.saveMemoryMap() +// let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbiggerexample") XCTAssertEqual(corpus.count, 20) @@ -87,8 +97,12 @@ final class HNSWTests: XCTestCase { } } - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let dataHandler = HNSWCorpusDataHandler(corpus: corpus, resource: "hnsw_testsubreddit") + let corpusSize = dataHandler.getCorpusSize() + let dictionarySize = dataHandler.getDictionarySize(includeKey: false) + print("Corpus size: \(corpusSize) bytes") + print("Dictionary size: \(dictionarySize) bytes") + dataHandler.saveMemoryMap() //print("Loaded \(corpus.count) documents.") XCTAssert(corpus.count == 17999) @@ -157,8 +171,11 @@ final class HNSWTests: XCTestCase { var corpus = HNSWCorpus(encoder: _documentEncoder) corpus.addUntokenizedDocuments(docs) - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let dataHandler = HNSWCorpusDataHandler(corpus: corpus) + let corpusSize = dataHandler.getCorpusSize() + let dictionarySize = dataHandler.getDictionarySize(includeKey: false) + print("Corpus size: \(corpusSize) bytes") + print("Dictionary size: \(dictionarySize) bytes") do { print("Attempting to query corpus.encodedDocuments.find()...") @@ -204,11 +221,16 @@ final class HNSWTests: XCTestCase { let query = "I love Albert Einstein!" let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - var corpus = HNSWCorpus(encoder: _documentEncoder) - corpus.addUntokenizedDocuments(docs) +// var corpus = HNSWCorpus(encoder: _documentEncoder) +// corpus.addUntokenizedDocuments(docs) +// +// let dataHandler = HNSWCorpusDataHandler(corpus: corpus) +// let corpusSize = dataHandler.getCorpusSize() +// let dictionarySize = dataHandler.getDictionarySize(includeKey: false) +// print("Corpus size: \(corpusSize) bytes") +// print("Dictionary size: \(dictionarySize) bytes") - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testbiggerexample") do { print("Attempting to query corpus.encodedDocuments.find()...") @@ -229,28 +251,34 @@ final class HNSWTests: XCTestCase { // TODO: Get HNSWCorpus from memory map func testSubredditQueryExample() async throws { - guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { - fatalError("Failed to find waterloo_submissions.zst in test bundle.") - } - guard let submissionsData = try? Data(contentsOf: submissionsURL) else { - fatalError("Failed to load waterloo_submissions.zst from test bundle.") - } - - let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) - +// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { +// fatalError("Failed to find waterloo_submissions.zst in test bundle.") +// } +// guard let submissionsData = try? Data(contentsOf: submissionsURL) else { +// fatalError("Failed to load waterloo_submissions.zst from test bundle.") +// } +// +// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) +// let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) - var corpus = HNSWCorpus(encoder: _documentEncoder) - - for submission in submissions { - if let text = submission.selftext { - corpus.addUntokenizedDocument(text) - } - } +// var corpus = HNSWCorpus(encoder: _documentEncoder) +// +// for submission in submissions { +// if let text = submission.selftext { +// corpus.addUntokenizedDocument(text) +// } +// } + let corpus = HNSWCorpusDataHandler<Double>.loadMemoryMap(encoding: .glove6B50d, resource: "hnsw_testsubreddit") let query = "Mr. Goose is a very important figure at the University of Waterloo." - let size = MemoryLayout.size(ofValue: corpus) - print("Approximate memory footprint: \(size) bytes") + let dataHandler = HNSWCorpusDataHandler(corpus: corpus) + let corpusSize = dataHandler.getCorpusSize() + let dictionarySize = dataHandler.getDictionarySize(includeKey: false) + print("Corpus size: \(corpusSize) bytes") + print("Dictionary size: \(dictionarySize) bytes") + + // Load from memory map here do { print("Attempting to query corpus.encodedDocuments.find()...")