//
//  DurableHNSWCorpus + File IO.swift
//
//
//  Created by Mingchung Xia on 2024-03-12.
//

import Foundation

// MARK: - Dictionary file IO (workaround)
// This extension saves and loads the dictionary of untokenized documents as a workaround:
// loading the memory mapped data using CoreLMDB does not load the untokenized documents
// (and the other fields) of a DurableHNSWCorpus, so the original data has to be written to
// and read from disk separately.
// Eventually, all the code in this extension should be moved to the HNSWCorpusDataHandler
// after a general wrapper class for DurableHNSW and EphemeralHNSW is made.
//
// File layout (every integer is a raw `Int` in the in-memory representation of the writer):
//   [entry count]
//   per entry: [key] [document byte count] [document UTF-8 bytes]
//              [vector element count] [vector elements as raw `Double`s]

/// Appends the raw in-memory bytes of `value` to `buffer`.
private func appendRawBytes<T>(of value: T, to buffer: inout Data) {
    withUnsafeBytes(of: value) { buffer.append(contentsOf: $0) }
}

/// Sequential, bounds-checked reader over the bytes of a dictionary file.
///
/// Every `read…` method returns `nil` (and consumes nothing) when the request does not fit
/// into the remaining bytes, so a truncated or corrupted file can never cause an
/// out-of-range access.
private struct DictionaryFileCursor {
    private let data: Data
    private var offset: Data.Index

    init(_ data: Data) {
        self.data = data
        self.offset = data.startIndex
    }

    /// Number of bytes that have not been consumed yet.
    private var remaining: Int { data.endIndex - offset }

    /// Reads one raw `Int`.
    mutating func readInt() -> Int? {
        let size = MemoryLayout<Int>.size
        guard remaining >= size else { return nil }

        let range = offset..<(offset + size)
        var value = 0
        // Copying into a local avoids any alignment requirement on the source bytes.
        withUnsafeMutableBytes(of: &value) { destination in
            _ = data.copyBytes(to: destination, from: range)
        }
        offset += size
        return value
    }

    /// Reads `count` raw bytes.
    mutating func readBytes(count: Int) -> Data? {
        guard count >= 0, count <= remaining else { return nil }

        let range = offset..<(offset + count)
        offset += count
        return data.subdata(in: range)
    }

    /// Reads `count` raw `Double` values.
    mutating func readDoubles(count: Int) -> [Double]? {
        let stride = MemoryLayout<Double>.stride
        // Dividing (instead of multiplying `count`) keeps the check safe from overflow.
        guard count >= 0, count <= remaining / stride else { return nil }

        var values = [Double](repeating: 0, count: count)
        // An empty array has no base address to copy into, so skip the copy entirely.
        if count > 0 {
            let byteCount = count * stride
            let range = offset..<(offset + byteCount)
            values.withUnsafeMutableBytes { destination in
                _ = data.copyBytes(to: destination, from: range)
            }
            offset += byteCount
        }
        return values
    }
}

extension DurableHNSWCorpus {
    /// Saves the dictionary of untokenized documents to a file in the user's Downloads directory.
    ///
    /// - Parameter fileName: Name of the file to create (or replace) inside the Downloads directory.
    func saveDictionaryToDownloads(fileName: String) {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return
        }

        let fileURL = downloadsURL.appendingPathComponent(fileName)

        saveDictionaryMemoryMap(url: fileURL)
    }

    /// Serializes `dictionary` and writes it to `url`, replacing any existing file.
    ///
    /// The whole payload is assembled in memory and written atomically, so a previous, longer
    /// file can never leave stale bytes behind and a failed write does not leave a partial file.
    /// Failures are reported with `print`, not thrown.
    ///
    /// - Parameter url: Destination file URL.
    func saveDictionaryMemoryMap(url: URL) {
        var buffer = Data()
        appendRawBytes(of: dictionary.count, to: &buffer)

        for (key, value) in dictionary {
            appendRawBytes(of: key, to: &buffer)

            // The document is stored as UTF-8, prefixed with its byte count so the reader
            // knows how much to consume.
            let documentData = Data(value.untokenizedDocument.utf8)
            appendRawBytes(of: documentData.count, to: &buffer)
            buffer.append(documentData)

            // The vector is stored as raw element bytes, prefixed with its element count.
            appendRawBytes(of: value.vector.count, to: &buffer)
            value.vector.withUnsafeBytes { buffer.append(contentsOf: $0) }
        }

        do {
            try buffer.write(to: url, options: .atomic)
        } catch {
            print("Error writing dictionary to file: \(error)")
        }
    }

    /// Loads a dictionary previously written by `saveDictionaryToDownloads(fileName:)`.
    ///
    /// - Parameters:
    ///   - fileName: Name of the file inside the user's Downloads directory.
    ///   - width: Forwarded to `readDictionaryMemoryMap(_:width:)`.
    /// - Returns: The decoded dictionary, or an empty dictionary when the Downloads directory
    ///   cannot be located.
    static func readDictionaryFromDownloads(fileName: String, width: Int = 50) -> HNSWDictionary {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return [:]
        }

        let fileURL = downloadsURL.appendingPathComponent(fileName)

        return readDictionaryMemoryMap(fileURL, width: width)
    }

    /// Loads a dictionary previously written by `saveDictionaryMemoryMap(url:)`.
    ///
    /// Reading is best-effort: if the file cannot be read the result is empty, and if the file
    /// is truncated or corrupted the entries decoded up to that point are returned. Problems
    /// are reported with `print`, not thrown.
    ///
    /// - Parameters:
    ///   - url: Location of the file to read.
    ///   - width: The number of dimensions of the glove encoding. Currently unused (each
    ///     vector carries its own element count in the file); kept for source compatibility.
    /// - Returns: The decoded dictionary.
    // TODO: Improve this to not need to take in a width, rather switch between the encoding / encoder
    static func readDictionaryMemoryMap(_ url: URL, width: Int = 50) -> HNSWDictionary {
        var dictionary = HNSWDictionary()

        let data: Data
        do {
            data = try Data(contentsOf: url)
        } catch {
            print("Error reading dictionary from file: \(error)")
            return dictionary
        }

        var cursor = DictionaryFileCursor(data)

        guard let count = cursor.readInt(), count >= 0 else {
            print("Error reading dictionary from file: missing or invalid entry count")
            return dictionary
        }

        for _ in 0..<count {
            guard let key = cursor.readInt(),
                  let documentLength = cursor.readInt(),
                  let documentData = cursor.readBytes(count: documentLength),
                  let vectorLength = cursor.readInt(),
                  let vector = cursor.readDoubles(count: vectorLength)
            else {
                print("Error reading dictionary from file: file is truncated or corrupted")
                break
            }

            // The string is decoded only after the whole record has been consumed, so an
            // undecodable document skips just this entry without desynchronizing the reader.
            guard let document = String(data: documentData, encoding: .utf8) else {
                print("Failed to decode string")
                continue // Skip this entry on failure
            }

            // Add the key-value pair to the dictionary
            dictionary[key] = DocumentVectorPair(untokenizedDocument: document, vector: vector)
        }

        return dictionary
    }
}
Data Collection/HNSW/HNSWCorpusDataHandler.swift @@ -84,6 +84,11 @@ extension HNSWCorpusDataHandler { } } + /// This saves only the untokenized documents dictionary map + func saveDictionaryMemoryMap() { + // TODO: Move from DurableHNSW extension once HNSW wrapper is created + } + // TODO: find out how to not rebuild the index static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { @@ -127,4 +132,8 @@ extension HNSWCorpusDataHandler { let encoder = ContextFreeEncoder<Scalar>(source: encoding) return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource) } + + static func loadDictionaryMemoryMap() { + // TODO: Move from DurableHNSW extension once HNSW wrapper is created + } } diff --git a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift b/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift index cf5afed4c75ba3f9ffa67fe7c93ab4554a2609b7..eb7919a977af196985431d2a83c7ac3dc1b24f04 100644 --- a/Tests/SwiftNLPTests/2. Encoding/HNSWTests.swift +++ b/Tests/SwiftNLPTests/2. 
Encoding/HNSWTests.swift @@ -282,15 +282,18 @@ final class HNSWTests: XCTestCase { let transaction = try Transaction.begin(.write, in: env) - var corpus = try DurableHNSWCorpus( - encoder: _documentEncoder, - namespace: "testbasicqueryexampledurable", - in: transaction - ) - - for doc in docs { - try corpus.addUntokenizedDocument(doc, in: transaction) - } + /// Saving the memory map to disk +// var corpus = try DurableHNSWCorpus( +// encoder: _documentEncoder, +// namespace: "testbasicqueryexampledurable", +// in: transaction +// ) +// +// for doc in docs { +// try corpus.addUntokenizedDocument(doc, in: transaction) +// } +// +// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap") try transaction.commit() @@ -299,6 +302,7 @@ final class HNSWTests: XCTestCase { let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) } + /// Reading the memory map (and dictionary) from disk let readTransaction = try Transaction.begin(.write, in: env) let readCorpus = try DurableHNSWCorpus( @@ -307,7 +311,8 @@ final class HNSWTests: XCTestCase { in: readTransaction ) - readCorpus.dictionary = corpus.getDictionary() // FIXME: don't copy over dictionary +// readCorpus.dictionary = corpus.getDictionary() // FIXME: don't copy over dictionary + readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // do not add documents here!