Skip to content
Snippets Groups Projects
Commit 58914b3a authored by Mingchung Xia's avatar Mingchung Xia
Browse files

Persistent LMDB readings past one lifecycle

parent 02257135
No related branches found
No related tags found
1 merge request!13HNSW Implementation with Testcases
Pipeline #113955 failed
//
// DurableHNSWCorpus + File IO.swift
//
//
// Created by Mingchung Xia on 2024-03-12.
//
import Foundation
// MARK: This extension for saving and loading the memory map data of untokenized documents is currently used as a workaround
// This is because loading the memory mapped data using CoreLMDB does not load the untokenized documents (and the other fields) of a DurableHNSWCorpus so in order to write and read from disk of the original data, we need to have this workaround
// Eventually, all this code in this extension should be moved to the HNSWCorpusDataHandler after a general wrapper class for DurableHNSW and EphemeralHNSW is made
extension DurableHNSWCorpus {
    /// Saves the untokenized-documents dictionary to the user's Downloads directory.
    /// - Parameter fileName: Name of the file to create (or overwrite) in Downloads.
    func saveDictionaryToDownloads(fileName: String) {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return
        }
        let fileURL = downloadsURL.appendingPathComponent(fileName)
        saveDictionaryMemoryMap(url: fileURL)
    }

    /// Serializes the dictionary to `url` with the layout:
    ///   [entryCount: Int] then per entry:
    ///   [key: Int][docByteCount: Int][doc UTF-8 bytes][vectorCount: Int][vector Doubles]
    /// Integers and doubles are written in native width/byte order, so the file is only
    /// readable on a machine with the same Int size and endianness.
    func saveDictionaryMemoryMap(url: URL) {
        let fileManager = FileManager.default
        if !fileManager.fileExists(atPath: url.path) {
            fileManager.createFile(atPath: url.path, contents: nil, attributes: nil)
        }
        do {
            let fileHandle = try FileHandle(forWritingTo: url)
            // Ensure the handle is released on every exit path, including write failures.
            defer { fileHandle.closeFile() }
            // FIX: opening an existing file for writing does NOT clear it. Without this
            // truncate, writing a payload shorter than the previous one leaves stale
            // trailing bytes that corrupt the next read.
            fileHandle.truncateFile(atOffset: 0)

            let count = dictionary.count
            fileHandle.write(withUnsafeBytes(of: count) { Data($0) })

            for (key, value) in dictionary {
                fileHandle.write(withUnsafeBytes(of: key) { Data($0) })

                // Document bytes, length-prefixed so the reader knows how much to consume.
                // A non-UTF-8-encodable string degrades to an empty document rather than
                // aborting the whole save.
                let documentData = value.untokenizedDocument.data(using: .utf8) ?? Data()
                fileHandle.write(withUnsafeBytes(of: documentData.count) { Data($0) })
                fileHandle.write(documentData)

                // Vector ([Double]), prefixed with its element count.
                fileHandle.write(withUnsafeBytes(of: value.vector.count) { Data($0) })
                fileHandle.write(value.vector.withUnsafeBytes { Data($0) })
            }
        } catch {
            print("Error writing dictionary to file: \(error)")
        }
    }

    /// Loads a dictionary previously written by `saveDictionaryMemoryMap(url:)` from the
    /// user's Downloads directory. Returns an empty dictionary on any failure.
    static func readDictionaryFromDownloads(fileName: String, width: Int = 50) -> HNSWDictionary {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return [:]
        }
        let fileURL = downloadsURL.appendingPathComponent(fileName)
        return readDictionaryMemoryMap(fileURL, width: width)
    }

    /// Reads one fixed-size value of `type` at `index`, advancing `index` past it.
    /// Returns nil (without advancing) if the read would run past the end of `data`.
    /// `T` must be a trivial type (Int, Double) matching what the writer emitted.
    private static func readValue<T>(_ type: T.Type, from data: Data, at index: inout Int) -> T? {
        let size = MemoryLayout<T>.size
        guard index >= 0, size <= data.count - index else { return nil }
        // subdata yields a freshly allocated buffer, so the load is safely aligned.
        let value = data.subdata(in: index..<index + size).withUnsafeBytes { $0.load(as: T.self) }
        index += size
        return value
    }

    /// Width is the number of dimensions of the glove encoding
    // TODO: Improve this to not need to take in a width, rather switch between the encoding / encoder
    // NOTE(review): `width` is currently unused here — the vector length is read from the
    // file itself. Kept for interface compatibility until the encoder switch lands.
    static func readDictionaryMemoryMap(_ url: URL, width: Int = 50) -> HNSWDictionary {
        var dictionary = HNSWDictionary()
        do {
            let data = try Data(contentsOf: url)
            var index = 0

            // FIX: all reads are now bounds-checked; a truncated or corrupt file stops
            // parsing instead of crashing in `subdata(in:)`.
            guard let count = readValue(Int.self, from: data, at: &index) else {
                print("Malformed dictionary file: missing entry count")
                return dictionary
            }

            for _ in 0..<count {
                guard let key = readValue(Int.self, from: data, at: &index),
                      let documentLength = readValue(Int.self, from: data, at: &index),
                      documentLength >= 0, documentLength <= data.count - index else {
                    print("Malformed dictionary file: truncated entry")
                    break
                }
                let documentData = data.subdata(in: index..<index + documentLength)
                index += documentLength

                guard let vectorLength = readValue(Int.self, from: data, at: &index),
                      vectorLength >= 0,
                      vectorLength * MemoryLayout<Double>.size <= data.count - index else {
                    print("Malformed dictionary file: truncated vector")
                    break
                }
                var vector = [Double]()
                vector.reserveCapacity(vectorLength)
                for _ in 0..<vectorLength {
                    guard let element = readValue(Double.self, from: data, at: &index) else { break }
                    vector.append(element)
                }

                // FIX: decode the string only after consuming the entry's vector bytes.
                // The original `continue`-on-failure happened before `index` was advanced
                // past the document and vector, misaligning every subsequent entry.
                guard let document = String(data: documentData, encoding: .utf8) else {
                    print("Failed to decode string")
                    continue // skip this entry; index already points at the next one
                }
                dictionary[key] = DocumentVectorPair(untokenizedDocument: document, vector: vector)
            }
        } catch {
            print("Error reading dictionary from file: \(error)")
        }
        return dictionary
    }
}
...@@ -84,6 +84,11 @@ extension HNSWCorpusDataHandler { ...@@ -84,6 +84,11 @@ extension HNSWCorpusDataHandler {
} }
} }
/// This saves only the untokenized documents dictionary map
func saveDictionaryMemoryMap() {
// TODO: Move from DurableHNSW extension once HNSW wrapper is created
}
// TODO: find out how to not rebuild the index
static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> { static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else { guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else {
...@@ -127,4 +132,8 @@ extension HNSWCorpusDataHandler { ...@@ -127,4 +132,8 @@ extension HNSWCorpusDataHandler {
let encoder = ContextFreeEncoder<Scalar>(source: encoding) let encoder = ContextFreeEncoder<Scalar>(source: encoding)
return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource) return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource)
} }
static func loadDictionaryMemoryMap() {
// TODO: Move from DurableHNSW extension once HNSW wrapper is created
}
} }
...@@ -282,15 +282,18 @@ final class HNSWTests: XCTestCase { ...@@ -282,15 +282,18 @@ final class HNSWTests: XCTestCase {
let transaction = try Transaction.begin(.write, in: env) let transaction = try Transaction.begin(.write, in: env)
var corpus = try DurableHNSWCorpus( /// Saving the memory map to disk
encoder: _documentEncoder, // var corpus = try DurableHNSWCorpus(
namespace: "testbasicqueryexampledurable", // encoder: _documentEncoder,
in: transaction // namespace: "testbasicqueryexampledurable",
) // in: transaction
// )
for doc in docs { //
try corpus.addUntokenizedDocument(doc, in: transaction) // for doc in docs {
} // try corpus.addUntokenizedDocument(doc, in: transaction)
// }
//
// corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")
try transaction.commit() try transaction.commit()
...@@ -299,6 +302,7 @@ final class HNSWTests: XCTestCase { ...@@ -299,6 +302,7 @@ final class HNSWTests: XCTestCase {
let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) } let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
/// Reading the memory map (and dictionary) from disk
let readTransaction = try Transaction.begin(.write, in: env) let readTransaction = try Transaction.begin(.write, in: env)
let readCorpus = try DurableHNSWCorpus( let readCorpus = try DurableHNSWCorpus(
...@@ -307,7 +311,8 @@ final class HNSWTests: XCTestCase { ...@@ -307,7 +311,8 @@ final class HNSWTests: XCTestCase {
in: readTransaction in: readTransaction
) )
readCorpus.dictionary = corpus.getDictionary() // FIXME: don't copy over dictionary // readCorpus.dictionary = corpus.getDictionary() // FIXME: don't copy over dictionary
readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")
// do not add documents here! // do not add documents here!
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment