From 7fb12cbb668dcb29a68f99650e1d7e29ab2b59e6 Mon Sep 17 00:00:00 2001 From: a252jain <a252jain@uwaterloo.ca> Date: Fri, 22 Mar 2024 09:27:06 -0400 Subject: [PATCH] broken commit --- Package.resolved | 45 +++ Package.swift | 4 +- .../SwiftNLP/2. Encoding/CoreMLEncoder.swift | 15 +- .../Tokenizers/BertTokenizer.swift | 5 +- .../2. Encoding/all-MiniLM-L6-v2.swift | 306 ------------------ .../SwiftNLP/2. Encoding/generic-model.swift | 140 -------- .../SwiftNLPTests/AllMiniLM_sampleTest.swift | 1 - 7 files changed, 60 insertions(+), 456 deletions(-) delete mode 100644 Sources/SwiftNLP/2. Encoding/all-MiniLM-L6-v2.swift delete mode 100644 Sources/SwiftNLP/2. Encoding/generic-model.swift diff --git a/Package.resolved b/Package.resolved index 187ae88b..377c4a1a 100644 --- a/Package.resolved +++ b/Package.resolved @@ -27,6 +27,15 @@ "version" : "2.2.0" } }, + { + "identity" : "faissmobile", + "kind" : "remoteSourceControl", + "location" : "https://github.com/jkrukowski/FaissMobile", + "state" : { + "revision" : "9d5b9925305eea9398cc92ce4a8e51c8a4b043af", + "version" : "0.0.1" + } + }, { "identity" : "similarity-topology", "kind" : "remoteSourceControl", @@ -36,6 +45,24 @@ "version" : "0.1.14" } }, + { + "identity" : "swift-argument-parser", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-argument-parser", + "state" : { + "revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41", + "version" : "1.3.0" + } + }, + { + "identity" : "swift-log", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-log", + "state" : { + "revision" : "e97a6fcb1ab07462881ac165fdbb37f067e205d5", + "version" : "1.5.4" + } + }, { "identity" : "swift-numerics", "kind" : "remoteSourceControl", @@ -62,6 +89,24 @@ "revision" : "6d90636e22510c2f0798f9f8ff072109e345750a", "version" : "1.1.0" } + }, + { + "identity" : "swiftfaiss", + "kind" : "remoteSourceControl", + "location" : "https://github.com/jkrukowski/SwiftFaiss.git", + "state" : { + "revision" : "d3831c1e9898695ae7f680b6353e48e873d3f1d3", + "version" : "0.0.8" + } + }, + { + "identity" : "swiftformat", + "kind" : "remoteSourceControl", + "location" : "https://github.com/nicklockwood/SwiftFormat", + "state" : { + "revision" : "dbc9a4406d21cc52f16caf1e299172b097145e5e", + "version" : "0.53.3" + } } ], "version" : 2 diff --git a/Package.swift b/Package.swift index db7eef19..45c096b0 100644 --- a/Package.swift +++ b/Package.swift @@ -15,6 +15,7 @@ let package = Package( ], dependencies: [ .package(url: "https://github.com/jbadger3/SwiftAnnoy", .upToNextMajor(from: "1.0.1")), + .package(url: "https://github.com/jkrukowski/SwiftFaiss.git", from: "0.0.7"), .package(url: "https://github.com/L1MeN9Yu/Elva", .upToNextMajor(from: "2.1.3")), .package(url: "https://github.com/JadenGeller/similarity-topology", .upToNextMajor(from: "0.1.14")), ], @@ -29,7 +30,8 @@ let package = Package( ], resources: [ - .process("Resources"), + .process("Resources/bert_vocab.txt"), + .process("Resources/glove.6B.50d.mmap"), ] ), .testTarget( diff --git a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift index d073978d..7871e3a8 100644 --- a/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift +++ b/Sources/SwiftNLP/2. Encoding/CoreMLEncoder.swift @@ -59,9 +59,9 @@ class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder { @available(macOS 13.0, *) public class MiniLMEmbeddings { - private let model: GenericLLMModel + private let model: LLMModel public let tokenizer: BertTokenizer - public let inputDimention: Int = 512 + public let inputDimention: Int = 128 public let outputDimention: Int = 384 public init() { @@ -69,12 +69,12 @@ public class MiniLMEmbeddings { modelConfig.computeUnits = .all do { - self.model = try GenericLLMModel(contentsOf: URL(fileURLWithPath: "Sources/SwiftNLP/Models/all-MiniLM-L6-v2.mlmodelc"), model_name: "all-MiniLM-L6-v2", input_size: inputDimention, output_size: outputDimention) + self.model = try LLMModel<all_MiniLM_L6_v2>() } catch { fatalError("Failed to load the Core ML model. Error: \(error.localizedDescription)") } - self.tokenizer = BertTokenizer() + self.tokenizer = BertTokenizer(maxLen: self.inputDimention) } // MARK: - Dense Embeddings @@ -84,15 +84,18 @@ public class MiniLMEmbeddings { let inputTokens = tokenizer.buildModelTokens(sentence: sentence) let (inputIds, attentionMask) = tokenizer.buildModelInputs(from: inputTokens) + print(inputIds.count, attentionMask.count) + // Send tokens through the MLModel let embeddings = generateEmbeddings(inputIds: inputIds, attentionMask: attentionMask) + print(inputIds.count, attentionMask.count) + return embeddings } public func generateEmbeddings(inputIds: MLMultiArray, attentionMask: MLMultiArray) -> [Float]? { - - let output : GenericLLMModelOutput? = try? model.prediction(input: GenericLLMModelInput(input_ids: inputIds, attention_mask: attentionMask)) + let output : LLMModelOutput? = try? model.prediction(input: LLMModelInput(input_ids: inputIds, attention_mask: attentionMask)) guard let embeddings = output?.embeddings else { return nil } diff --git a/Sources/SwiftNLP/2. Encoding/Tokenizers/BertTokenizer.swift b/Sources/SwiftNLP/2. Encoding/Tokenizers/BertTokenizer.swift index 376cdff5..8c1e01d3 100644 --- a/Sources/SwiftNLP/2. Encoding/Tokenizers/BertTokenizer.swift +++ b/Sources/SwiftNLP/2. Encoding/Tokenizers/BertTokenizer.swift @@ -6,12 +6,12 @@ import CoreML public class BertTokenizer { private let basicTokenizer = BasicTokenizer() private let wordpieceTokenizer: WordpieceTokenizer - private let maxLen = 512 + private var maxLen = 512 private let vocab: [String: Int] private let ids_to_tokens: [Int: String] - public init() { + public init(maxLen: Int) { let url = Bundle.module.url(forResource: "bert_vocab", withExtension: "txt")! let vocabTxt = try! String(contentsOf: url) let tokens = vocabTxt.split(separator: "\n").map { String($0) } @@ -24,6 +24,7 @@ public class BertTokenizer { self.vocab = vocab self.ids_to_tokens = ids_to_tokens self.wordpieceTokenizer = WordpieceTokenizer(vocab: self.vocab) + self.maxLen = maxLen } public func buildModelTokens(sentence: String) -> [Int] { diff --git a/Sources/SwiftNLP/2. Encoding/all-MiniLM-L6-v2.swift b/Sources/SwiftNLP/2. Encoding/all-MiniLM-L6-v2.swift deleted file mode 100644 index c2f0c441..00000000 --- a/Sources/SwiftNLP/2. Encoding/all-MiniLM-L6-v2.swift +++ /dev/null @@ -1,306 +0,0 @@ -// -// all_MiniLM_L6_v2.swift -// -// This file was automatically generated and should not be edited. -// - -import CoreML - - -/// Model Prediction Input Type -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class all_MiniLM_L6_v2Input : MLFeatureProvider { - - /// input_ids as 1 by 512 matrix of floats - var input_ids: MLMultiArray - - /// attention_mask as 1 by 512 matrix of floats - var attention_mask: MLMultiArray - - var featureNames: Set<String> { - get { - return ["input_ids", "attention_mask"] - } - } - - func featureValue(for featureName: String) -> MLFeatureValue? { - if (featureName == "input_ids") { - return MLFeatureValue(multiArray: input_ids) - } - if (featureName == "attention_mask") { - return MLFeatureValue(multiArray: attention_mask) - } - return nil - } - - init(input_ids: MLMultiArray, attention_mask: MLMultiArray) { - self.input_ids = input_ids - self.attention_mask = attention_mask - } - - convenience init(input_ids: MLShapedArray<Float>, attention_mask: MLShapedArray<Float>) { - self.init(input_ids: MLMultiArray(input_ids), attention_mask: MLMultiArray(attention_mask)) - } - -} - - -/// Model Prediction Output Type -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class all_MiniLM_L6_v2Output : MLFeatureProvider { - - /// Source provided by CoreML - private let provider : MLFeatureProvider - - /// embeddings as multidimensional array of floats - var embeddings: MLMultiArray { - return self.provider.featureValue(for: "embeddings")!.multiArrayValue! - } - - /// embeddings as multidimensional array of floats - var embeddingsShapedArray: MLShapedArray<Float> { - return MLShapedArray<Float>(self.embeddings) - } - - var featureNames: Set<String> { - return self.provider.featureNames - } - - func featureValue(for featureName: String) -> MLFeatureValue? { - return self.provider.featureValue(for: featureName) - } - - init(embeddings: MLMultiArray) { - self.provider = try! MLDictionaryFeatureProvider(dictionary: ["embeddings" : MLFeatureValue(multiArray: embeddings)]) - } - - init(features: MLFeatureProvider) { - self.provider = features - } -} - - -/// Class for model loading and prediction -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class all_MiniLM_L6_v2 { - let model: MLModel - - /// URL of model assuming it was installed in the same bundle as this class - class var urlOfModelInThisBundle : URL { - let bundle = Bundle(for: self) - return bundle.url(forResource: "all-MiniLM-L6-v2", withExtension:"mlmodelc")! - } - - /** - Construct all_MiniLM_L6_v2 instance with an existing MLModel object. - - Usually the application does not use this initializer unless it makes a subclass of all_MiniLM_L6_v2. - Such application may want to use `MLModel(contentsOfURL:configuration:)` and `all_MiniLM_L6_v2.urlOfModelInThisBundle` to create a MLModel object to pass-in. - - - parameters: - - model: MLModel object - */ - init(model: MLModel) { - self.model = model - } - - /** - Construct a model with configuration - - - parameters: - - configuration: the desired model configuration - - - throws: an NSError object that describes the problem - */ - convenience init(configuration: MLModelConfiguration = MLModelConfiguration()) throws { - try self.init(contentsOf: type(of:self).urlOfModelInThisBundle, configuration: configuration) - } - - /** - Construct all_MiniLM_L6_v2 instance with explicit path to mlmodelc file - - parameters: - - modelURL: the file url of the model - - - throws: an NSError object that describes the problem - */ - convenience init(contentsOf modelURL: URL) throws { - try self.init(model: MLModel(contentsOf: modelURL)) - } - - /** - Construct a model with URL of the .mlmodelc directory and configuration - - - parameters: - - modelURL: the file url of the model - - configuration: the desired model configuration - - - throws: an NSError object that describes the problem - */ - convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration) throws { - try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration)) - } - - /** - Construct all_MiniLM_L6_v2 instance asynchronously with optional configuration. - - Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. - - - parameters: - - configuration: the desired model configuration - - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully - */ - class func load(configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<all_MiniLM_L6_v2, Error>) -> Void) { - return self.load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration, completionHandler: handler) - } - - /** - Construct all_MiniLM_L6_v2 instance asynchronously with optional configuration. - - Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. - - - parameters: - - configuration: the desired model configuration - */ - class func load(configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> all_MiniLM_L6_v2 { - return try await self.load(contentsOf: self.urlOfModelInThisBundle, configuration: configuration) - } - - /** - Construct all_MiniLM_L6_v2 instance asynchronously with URL of the .mlmodelc directory with optional configuration. - - Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. - - - parameters: - - modelURL: the URL to the model - - configuration: the desired model configuration - - handler: the completion handler to be called when the model loading completes successfully or unsuccessfully - */ - class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration(), completionHandler handler: @escaping (Swift.Result<all_MiniLM_L6_v2, Error>) -> Void) { - MLModel.load(contentsOf: modelURL, configuration: configuration) { result in - switch result { - case .failure(let error): - handler(.failure(error)) - case .success(let model): - handler(.success(all_MiniLM_L6_v2(model: model))) - } - } - } - - /** - Construct all_MiniLM_L6_v2 instance asynchronously with URL of the .mlmodelc directory with optional configuration. - - Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread. - - - parameters: - - modelURL: the URL to the model - - configuration: the desired model configuration - */ - class func load(contentsOf modelURL: URL, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> all_MiniLM_L6_v2 { - let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration) - return all_MiniLM_L6_v2(model: model) - } - - /** - Make a prediction using the structured interface - - - parameters: - - input: the input to the prediction as all_MiniLM_L6_v2Input - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as all_MiniLM_L6_v2Output - */ - func prediction(input: all_MiniLM_L6_v2Input) throws -> all_MiniLM_L6_v2Output { - return try self.prediction(input: input, options: MLPredictionOptions()) - } - - /** - Make a prediction using the structured interface - - - parameters: - - input: the input to the prediction as all_MiniLM_L6_v2Input - - options: prediction options - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as all_MiniLM_L6_v2Output - */ - func prediction(input: all_MiniLM_L6_v2Input, options: MLPredictionOptions) throws -> all_MiniLM_L6_v2Output { - let outFeatures = try model.prediction(from: input, options:options) - return all_MiniLM_L6_v2Output(features: outFeatures) - } - - /** - Make an asynchronous prediction using the structured interface - - - parameters: - - input: the input to the prediction as all_MiniLM_L6_v2Input - - options: prediction options - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as all_MiniLM_L6_v2Output - */ - @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, *) - func prediction(input: all_MiniLM_L6_v2Input, options: MLPredictionOptions = MLPredictionOptions()) async throws -> all_MiniLM_L6_v2Output { - let outFeatures = try await model.prediction(from: input, options:options) - return all_MiniLM_L6_v2Output(features: outFeatures) - } - - /** - Make a prediction using the convenience interface - - - parameters: - - input_ids as 1 by 512 matrix of floats - - attention_mask as 1 by 512 matrix of floats - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as all_MiniLM_L6_v2Output - */ - func prediction(input_ids: MLMultiArray, attention_mask: MLMultiArray) throws -> all_MiniLM_L6_v2Output { - let input_ = all_MiniLM_L6_v2Input(input_ids: input_ids, attention_mask: attention_mask) - return try self.prediction(input: input_) - } - - /** - Make a prediction using the convenience interface - - - parameters: - - input_ids as 1 by 512 matrix of floats - - attention_mask as 1 by 512 matrix of floats - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as all_MiniLM_L6_v2Output - */ - - func prediction(input_ids: MLShapedArray<Float>, attention_mask: MLShapedArray<Float>) throws -> all_MiniLM_L6_v2Output { - let input_ = all_MiniLM_L6_v2Input(input_ids: input_ids, attention_mask: attention_mask) - return try self.prediction(input: input_) - } - - /** - Make a batch prediction using the structured interface - - - parameters: - - inputs: the inputs to the prediction as [all_MiniLM_L6_v2Input] - - options: prediction options - - - throws: an NSError object that describes the problem - - - returns: the result of the prediction as [all_MiniLM_L6_v2Output] - */ - func predictions(inputs: [all_MiniLM_L6_v2Input], options: MLPredictionOptions = MLPredictionOptions()) throws -> [all_MiniLM_L6_v2Output] { - let batchIn = MLArrayBatchProvider(array: inputs) - let batchOut = try model.predictions(from: batchIn, options: options) - var results : [all_MiniLM_L6_v2Output] = [] - results.reserveCapacity(inputs.count) - for i in 0..<batchOut.count { - let outProvider = batchOut.features(at: i) - let result = all_MiniLM_L6_v2Output(features: outProvider) - results.append(result) - } - return results - } -} diff --git a/Sources/SwiftNLP/2. Encoding/generic-model.swift b/Sources/SwiftNLP/2. Encoding/generic-model.swift deleted file mode 100644 index 9c7308cc..00000000 --- a/Sources/SwiftNLP/2. Encoding/generic-model.swift +++ /dev/null @@ -1,140 +0,0 @@ -import CoreML - - -/// Model Prediction Input Type -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class GenericLLMModelInput : MLFeatureProvider { - - var input_ids: MLMultiArray - - var attention_mask: MLMultiArray - - var featureNames: Set<String> { - get { - return ["input_ids", "attention_mask"] - } - } - - func featureValue(for featureName: String) -> MLFeatureValue? { - if (featureName == "input_ids") { - return MLFeatureValue(multiArray: input_ids) - } - if (featureName == "attention_mask") { - return MLFeatureValue(multiArray: attention_mask) - } - return nil - } - - init(input_ids: MLMultiArray, attention_mask: MLMultiArray) { - self.input_ids = input_ids - self.attention_mask = attention_mask - } - - convenience init(input_ids: MLShapedArray<Float>, attention_mask: MLShapedArray<Float>) { - self.init(input_ids: MLMultiArray(input_ids), attention_mask: MLMultiArray(attention_mask)) - } -} - -/// Model Prediction Output Type -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class GenericLLMModelOutput : MLFeatureProvider { - - private let provider : MLFeatureProvider - - var embeddings: MLMultiArray { - return self.provider.featureValue(for: "embeddings")!.multiArrayValue! - } - - var embeddingsShapedArray: MLShapedArray<Float> { - return MLShapedArray<Float>(self.embeddings) - } - - var featureNames: Set<String> { - return self.provider.featureNames - } - - func featureValue(for featureName: String) -> MLFeatureValue? { - return self.provider.featureValue(for: featureName) - } - - init(embeddings: MLMultiArray) { - self.provider = try! MLDictionaryFeatureProvider(dictionary: ["embeddings" : MLFeatureValue(multiArray: embeddings)]) - } - - init(features: MLFeatureProvider) { - self.provider = features - } -} - - -/// Class for model loading and prediction -@available(macOS 12.0, iOS 15.0, tvOS 15.0, watchOS 8.0, *) -class GenericLLMModel { - let model: MLModel - let model_name: String - let input_size: Int - let output_size: Int - - init(model: MLModel, model_name: String, input_size: Int, output_size: Int) { - self.model = model - self.model_name = model_name - self.input_size = input_size - self.output_size = output_size - } - - convenience init(contentsOf modelURL: URL, model_name: String, input_size: Int, output_size: Int) throws { - try self.init(model: MLModel(contentsOf: modelURL), model_name: model_name, input_size: input_size, output_size: output_size) - } - - convenience init(contentsOf modelURL: URL, configuration: MLModelConfiguration, model_name: String, input_size: Int, output_size: Int) throws { - try self.init(model: MLModel(contentsOf: modelURL, configuration: configuration), model_name: model_name, input_size: input_size, output_size: output_size) - } - - class func load(contentsOf modelURL: URL, model_name: String, input_size: Int, output_size: Int, configuration: MLModelConfiguration = MLModelConfiguration()) async throws -> GenericLLMModel { - let model = try await MLModel.load(contentsOf: modelURL, configuration: configuration) - return GenericLLMModel(model: model, model_name: model_name, input_size: input_size, output_size: output_size) - } - - func prediction(input: GenericLLMModelInput) throws -> GenericLLMModelOutput { - return try self.prediction(input: input, options: MLPredictionOptions()) - } - - func prediction(input: GenericLLMModelInput, options: MLPredictionOptions) throws -> GenericLLMModelOutput { - let outFeatures = try model.prediction(from: input, options:options) - return GenericLLMModelOutput(features: outFeatures) - } - - @available(macOS 14.0, iOS 17.0, tvOS 17.0, watchOS 10.0, *) - func prediction(input: GenericLLMModelInput, options: MLPredictionOptions = MLPredictionOptions()) async throws -> GenericLLMModelOutput { - let outFeatures = try await model.prediction(from: input, options:options) - return GenericLLMModelOutput(features: outFeatures) - } - - func prediction(input_ids: MLMultiArray, attention_mask: MLMultiArray) throws -> GenericLLMModelOutput { - let input_ = GenericLLMModelInput(input_ids: input_ids, attention_mask: attention_mask) - return try self.prediction(input: input_) - } - - func prediction(input_ids: MLShapedArray<Float>, attention_mask: MLShapedArray<Float>) throws -> GenericLLMModelOutput { - let input_ = GenericLLMModelInput(input_ids: input_ids, attention_mask: attention_mask) - return try self.prediction(input: input_) - } - - func predictions(inputs: [GenericLLMModelInput], options: MLPredictionOptions = MLPredictionOptions()) throws -> [GenericLLMModelOutput] { - for input in inputs { - assert(input.input_ids.count == self.input_size) - } - - let batchIn = MLArrayBatchProvider(array: inputs) - let batchOut = try model.predictions(from: batchIn, options: options) - var results : [GenericLLMModelOutput] = [] - results.reserveCapacity(inputs.count) - for i in 0..<batchOut.count { - let outProvider = batchOut.features(at: i) - let result = GenericLLMModelOutput(features: outProvider) - results.append(result) - } - - return results - } -} diff --git a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift index 8e837333..64e401f1 100644 --- a/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift +++ b/Tests/SwiftNLPTests/AllMiniLM_sampleTest.swift @@ -35,7 +35,6 @@ final class BERT_test: XCTestCase { var embedding_dim: Int = 384 var model = MiniLMEmbeddings() - query_embedding = await model.encode(sentence: query[0])! -- GitLab