From a0dd924c7354b90afdd95e088c76ea8d7f6de70a Mon Sep 17 00:00:00 2001 From: Mingchung Xia <mingchung.xia@gmail.com> Date: Wed, 14 Feb 2024 00:07:48 -0500 Subject: [PATCH] Reverted ContextFreeEncoder changes --- .../ContextFreeEncoder + File IO .swift | 91 ------------------- .../2. Encoding/ContextFreeEncoder.swift | 8 -- 2 files changed, 99 deletions(-) diff --git a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift index 7113b9ce..cd5b35ef 100644 --- a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift +++ b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder + File IO .swift @@ -134,95 +134,4 @@ extension ContextFreeEncoder { return nil } } - - static func writeHNSWToFile(url: URL, hnsw: DeterministicSampleVectorIndex<[Scalar]>) { - let fileManager = FileManager.default - if !fileManager.fileExists(atPath: url.path) { - fileManager.createFile(atPath: url.path, contents: nil, attributes: nil) - } - - do { - let fileHandle = try FileHandle(forWritingTo: url) - - let count = hnsw.base.vectors.count - let countData = withUnsafeBytes(of: count) { Data($0) } - fileHandle.write(countData) - - for vector in hnsw.base.vectors { - let vectorData = vector.map { Float($0) }.withUnsafeBufferPointer { Data(buffer: $0) } - fileHandle.write(vectorData) - } - - fileHandle.closeFile() - } catch { - print("Error writing HNSW to file: \(error)") - } - } - - static func readHNSWFromFile(_ url: URL) -> DeterministicSampleVectorIndex<[Scalar]> { - do { - let data = try Data(contentsOf: url, options: .alwaysMapped) - let countData = data.prefix(MemoryLayout<Int>.size) - let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) } - var index = MemoryLayout<Int>.size - - var hnsw = DeterministicSampleVectorIndex<[Scalar]>() - -// while index < data.count { -// if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) { -// let keyData = data[index..<stringRange.lowerBound] -// if let key = String(data: keyData, encoding: .utf8) { -// index = stringRange.upperBound -// -// let valuesData = data[index..<(index + 50 * MemoryLayout<Scalar>.size)] -// let values = valuesData.withUnsafeBytes { Array($0.bindMemory(to: Scalar.self)) } -// hnsw.insert(values) -// } -// } else { -// break -// } -// } - - for _ in 0..<count { - let vectorData = data[index..<(index + 50 * MemoryLayout<Scalar>.size)] - let vector = vectorData.withUnsafeBytes { Array($0.bindMemory(to: Scalar.self)) } - hnsw.insert(vector) - index += 50 * MemoryLayout<Scalar>.size - } - - return hnsw - } catch { - print("Error reading HNSW from file: \(error)") - } - return DeterministicSampleVectorIndex<[Scalar]>() - } - - static func readHNSWFromTextFile(from url: URL) -> DeterministicSampleVectorIndex<[Scalar]>? { - do { - let content = try String(contentsOf: url, encoding: .utf8) - let lines = content.split(separator: "\n") - -// var data = DeterministicSampleVectorIndex<[Scalar]>() -// -// for line in lines.dropFirst() { -// let tokens = line.split(separator: " ") -// let word = String(tokens[0]) -// let vector = tokens.dropFirst().compactMap { Scalar(Double($0)!) } -// data.insert(vector) -// } -// -// return data - - var hnsw = DeterministicSampleVectorIndex<[Scalar]>() - for line in lines { - let vector = line.split(separator: " ").compactMap { Scalar(Double($0)!) } - hnsw.insert(vector) - } - - return hnsw - } catch { - print("Error loading vectors from text file: \(error)") - return nil - } - } } diff --git a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder.swift b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder.swift index 295c144d..f623a512 100644 --- a/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/ContextFreeEncoder.swift @@ -26,7 +26,6 @@ import Foundation class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { var dictionary: [String : [Scalar]] - var hnsw: DeterministicSampleVectorIndex<[Scalar]> let width: Int var zeroes: [Scalar] @@ -35,14 +34,12 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { public enum PreComputedEmbeddings { case glove6B50d case glove6B100d - case hnswindex //case NLEmbedding } init(source: PreComputedEmbeddings) { dictionary = Dictionary<String,[Scalar]>() - hnsw = DeterministicSampleVectorIndex<[Scalar]>() var dictionaryToLoad: String switch source { @@ -53,10 +50,6 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { case .glove6B100d: width = 100 dictionaryToLoad = "glove.6B.100d" - - case .hnswindex: - width = 50 // double check the dimension for HNSW - dictionaryToLoad = "hnswindex" } zeroes = Array(repeating: Scalar(0), count: width) as! [Scalar] @@ -69,7 +62,6 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { return } dictionary = ContextFreeEncoder<Scalar>.readDictionaryFromFile(url) -// hnsw = ContextFreeEncoder<Scalar>.readHNSWFromFile(url) } subscript(_ token: String) -> [Scalar] { -- GitLab