Skip to content
Snippets Groups Projects

Add interface for using generic CoreML LLMs

Merged Abhinav Jain requested to merge compile_cmd_line_generic_model_broken into main
Files
5
@@ -26,69 +26,99 @@ import Foundation
import CoreML
// NOTE(review): removed stray old-side diff line `struct CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {`
// — it duplicated the `class CoreMLEncoder` declaration below and left an unbalanced brace.

/// Expands to model-specific prediction code for the named input/attention-mask/output
/// variables. The arguments are the *names* (as strings) of local variables in the
/// caller's scope; the macro implementation lives in `SwiftNLPGenericLLMMacros`.
@freestanding(expression)
public macro MODEL_MAKE_PREDICTION(_ input_name: Any, _ attention_ids: Any, _ output_name: Any) = #externalMacro(
    module: "SwiftNLPGenericLLMMacros",
    type: "LLMModelPredictionCases")

/// Validates that `self.model` names a supported model and sets the matching input
/// dimension (expected to assign `self.inputDimention` — see `LLMEmbeddings.init`).
@freestanding(expression)
public macro MODEL_VALIDATE_NAME_AND_SET_INPUT_SIZE() = #externalMacro(
    module: "SwiftNLPGenericLLMMacros",
    type: "LLMModelNameValidation")
/// Encoder backed by a generic CoreML LLM, selected by name via `LLMEmbeddings`.
/// Defaults to `all_MiniLM_L6_v2` with 384-dimensional embeddings.
class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {

    var zeroes: [Scalar]
    var dimensions: UInt
    // Model identifier passed through to `LLMEmbeddings(model_type:)`.
    var model: String

    required init() {
        zeroes = Array(repeating: Scalar(0), count: 384)
        dimensions = 384
        model = "all_MiniLM_L6_v2"
    }

    /// Tokenizes a single token string into its id sequence.
    func encodeToken(_ token: String) -> [Scalar] {
        // NOTE(review): `tokenizeToIds` presumably returns integer token ids; force-casting
        // the result to `[Scalar]` (a floating-point array) will trap at runtime — confirm
        // the tokenizer's return type and convert element-wise instead of using `as!`.
        let tokenization = LLMEmbeddings(model_type: self.model).tokenizer.tokenizeToIds(text: token) as! [Scalar]
        return tokenization
    }

    /// Encodes a full sentence into an embedding vector.
    func encodeSentence(_ sentence: String) -> [Scalar] {
        // NOTE(review): this casts the `Task` *handle* — not its awaited result — to
        // `[Scalar]`, which always traps at runtime (the MR branch name says "broken").
        // The fix needs an async interface or a synchronous bridge to await the task's
        // value; left as-is pending that design decision.
        let encoding = Task {
            await LLMEmbeddings(model_type: self.model).encode(sentence: sentence)
        } as! [Scalar]
        return encoding
    }
}
//@available(macOS 13.0, *)
//public class MiniLMEmbeddings {
// public let model: all_MiniLM_L6_v2
// public let tokenizer: BertTokenizer
// public let inputDimention: Int = 512
// public let outputDimention: Int = 384
//
// public init() {
// let modelConfig = MLModelConfiguration()
// modelConfig.computeUnits = .all
//
// do {
// self.model = try all_MiniLM_L6_v2(configuration: modelConfig)
// } catch {
// fatalError("Failed to load the Core ML model. Error: \(error.localizedDescription)")
// }
//
// self.tokenizer = BertTokenizer()
// }
//
// // MARK: - Dense Embeddings
//
// public func encode(sentence: String) async -> [Float]? {
// // Encode input text as bert tokens
// let inputTokens = tokenizer.buildModelTokens(sentence: sentence)
// let (inputIds, attentionMask) = tokenizer.buildModelInputs(from: inputTokens)
//
// // Send tokens through the MLModel
// let embeddings = generateEmbeddings(inputIds: inputIds, attentionMask: attentionMask)
//
// return embeddings
// }
//
// public func generateEmbeddings(inputIds: MLMultiArray, attentionMask: MLMultiArray) -> [Float]? {
// let inputFeatures = all_MiniLM_L6_v2Input(input_ids: inputIds, attention_mask: attentionMask)
//
// let output = try? model.prediction(input: inputFeatures)
// guard let embeddings = output?.embeddings else {
// return nil
// }
//
// var embeddingsArray = [Float]()
// for index in 0..<embeddings.count {
// let value = embeddings[index].floatValue
// embeddingsArray.append(Float(value))
// }
//
// return embeddingsArray
// }
//
//}
@available(macOS 13.0, *)
/// Generic CoreML LLM wrapper: validates the requested model name, sizes the
/// tokenizer to the model's input length, and produces dense float embeddings.
public class LLMEmbeddings {

    // Model name; consumed by the freestanding macros below.
    private let model: String
    public var tokenizer: BertTokenizer
    // NOTE(review): "Dimention" is a typo for "Dimension", but the name is part of the
    // public API and is assigned by MODEL_VALIDATE_NAME_AND_SET_INPUT_SIZE — renaming
    // would break the macro expansion and external callers, so it is kept.
    public var inputDimention: Int = 512 // 512 is a dummy value, correct value is set by the macro below
    public let outputDimention: Int = 384

    /// - Parameter model_type: name of a supported CoreML model; validated by macro.
    public init(model_type: String) {
        let modelConfig = MLModelConfiguration()
        modelConfig.computeUnits = .all

        self.model = model_type

        // Dummy initialization is needed here so all stored properties are set before
        // the macro expansion reads `self.model` (avoids a definite-initialization error).
        self.tokenizer = BertTokenizer(maxLen: self.inputDimention)

        // Validate the model name and set the correct input dimension.
        #MODEL_VALIDATE_NAME_AND_SET_INPUT_SIZE()

        // Reinitialize the tokenizer with the model's true input size.
        self.tokenizer = BertTokenizer(maxLen: self.inputDimention)
    }

    /// Encodes `sentence` to a dense embedding, or `nil` if prediction fails.
    public func encode(sentence: String) async -> [Float]? {
        // Encode input text as BERT tokens, then run them through the model.
        let inputTokens = tokenizer.buildModelTokens(sentence: sentence)
        let (inputIds, attentionMask) = tokenizer.buildModelInputs(from: inputTokens)
        return generateEmbeddings(inputIds: inputIds, attentionMask: attentionMask)
    }

    /// Runs the model prediction and flattens the output multi-array to `[Float]`.
    /// Returns `nil` when the underlying prediction produced no output.
    public func generateEmbeddings(inputIds: MLMultiArray, attentionMask: MLMultiArray) -> [Float]? {
        var output: MLMultiArray? = nil

        // The macro expands to a switch over supported models that assigns `output`.
        #MODEL_MAKE_PREDICTION("inputIds", "attentionMask", "output")

        // guard-let replaces the original `=== nil` identity check + force unwrap.
        guard let embeddings = output else {
            return nil
        }

        var embeddingsArray = [Float]()
        embeddingsArray.reserveCapacity(embeddings.count)
        for index in 0..<embeddings.count {
            embeddingsArray.append(embeddings[index].floatValue)
        }
        return embeddingsArray
    }
}
#endif
Loading