Skip to content
Snippets Groups Projects
Commit 379a7aaa authored by Mingchung Xia's avatar Mingchung Xia
Browse files

Added basic data handler for mmap

parent e34f6e70
No related branches found
No related tags found
1 merge request: !13 "HNSW Implementation with Testcases"
Pipeline #111792 failed
...@@ -8,8 +8,7 @@ ...@@ -8,8 +8,7 @@
import Foundation import Foundation
final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> { final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> {
private var corpus: HNSWCorpus<Scalar> var corpus: HNSWCorpus<Scalar>
private var mmapURL: URL? // set default URL
init(corpus: HNSWCorpus<Scalar>) { init(corpus: HNSWCorpus<Scalar>) {
self.corpus = corpus self.corpus = corpus
...@@ -17,11 +16,55 @@ final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> { ...@@ -17,11 +16,55 @@ final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> {
} }
extension HNSWCorpusDataHandler { extension HNSWCorpusDataHandler {
/// Serializes the corpus' untokenized documents to `url`.
///
/// File layout: the raw bytes of `corpus.count` (native width and byte order),
/// followed by each document encoded as UTF-8 and terminated by a single NUL byte.
///
/// The payload is assembled in memory and written in one atomic operation so that
/// an existing, longer file is fully replaced rather than partially overwritten
/// (a plain `FileHandle(forWritingTo:)` does not truncate and would leave stale
/// trailing bytes behind), and so that write failures surface as Swift errors that
/// the `catch` below can actually report.
///
/// - Parameter url: Destination file URL. The file is created or replaced.
func saveMemoryMap(url: URL) {
    // Header: number of documents that follow.
    let count = corpus.count
    var payload = withUnsafeBytes(of: count) { Data($0) }

    // TODO: We may need to edit the HNSWCorpus iterator to actually iterate over its dictionary as it would be useful here
    for (_, documentVectorPair) in corpus.getDictionary() {
        // UTF-8 bytes plus a NUL terminator, matching what the loader scans for.
        payload.append(contentsOf: documentVectorPair.untokenizedDocument.utf8)
        payload.append(0)
    }

    do {
        try payload.write(to: url, options: .atomic)
    } catch {
        print("Error writing HNSW to file: \(error)")
    }
}
// TODO: Change the return from Double to Scalar
/// Rebuilds a corpus from a file previously produced by `saveMemoryMap(url:)`.
///
/// Expected layout: a native-width `Int` document count followed by that many
/// NUL-terminated UTF-8 strings. The file is memory-mapped rather than read eagerly.
///
/// Malformed input is tolerated rather than trapping: a file too short to hold the
/// count, or a negative count, yields an empty corpus; a missing terminator stops
/// reading; a record that is not valid UTF-8 is skipped and reading continues with
/// the next record.
///
/// - Parameters:
///   - url: Location of the serialized corpus.
///   - encoder: Encoder handed to the newly created `HNSWCorpus`.
/// - Returns: A corpus containing every document that could be decoded.
func loadMemoryMap(url: URL, encoder: any SNLPEncoder) -> HNSWCorpus<Double> {
    var loadedCorpus = HNSWCorpus(encoder: encoder)
    do {
        let data = try Data(contentsOf: url, options: .alwaysMapped)

        // Loading an Int from fewer bytes than its size would trap, so validate first.
        let headerSize = MemoryLayout<Int>.size
        guard data.count >= headerSize else {
            print("Error reading HNSW from file: file is too short to contain a document count")
            return loadedCorpus
        }
        let count = data.prefix(headerSize).withUnsafeBytes { $0.load(as: Int.self) }

        // A negative count (corrupt header) would trap when forming `0..<count`.
        guard count >= 0 else {
            print("Error reading HNSW from file: invalid document count \(count)")
            return loadedCorpus
        }

        var index = data.startIndex + headerSize
        for _ in 0..<count {
            // Each record ends at the next NUL byte; no terminator means truncated data.
            guard let terminator = data[index...].firstIndex(of: 0) else {
                break
            }
            if let document = String(data: data[index..<terminator], encoding: .utf8) {
                // Add the untokenized document to the corpus
                loadedCorpus.addUntokenizedDocument(document)
            }
            // Always step past the terminator, even for an undecodable record, so one
            // bad entry does not cause every later iteration to re-read it.
            index = data.index(after: terminator)
        }
    } catch {
        print("Error reading HNSW from file: \(error)")
    }
    return loadedCorpus
}
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment