Tidy up SNLP Protocols

8d6c3c05 · Jim Wallace · bd7aeb1c · 8d6c3c05 · 8d6c3c05 · 8d6c3c05
Commit 8d6c3c05 authored 1 year ago by Jim Wallace
--- a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus + Text Processing.swift
+++ b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus + Text Processing.swift
+import Foundation
+
+@available(macOS 10.15.0, *)
+extension SNLPCorpus {
+
+    /// Normalises a raw string into a list of lowercase tokens.
+    ///
+    /// The text is lowercased, every character in `characterFilters` is deleted
+    /// (the text on either side of a deleted character is joined together), the
+    /// remainder is split on whitespace, and tokens found in `tokenFilters` are dropped.
+    ///
+    /// - Parameters:
+    ///   - text: The raw text to tokenise.
+    ///   - characterFilters: Characters to strip from the text before it is split.
+    ///   - tokenFilters: Tokens to discard; compared against the lowercased tokens.
+    /// - Returns: The remaining tokens, in their original order.
+    static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String] {
+        // Remove characters contained in characterFilters
+        let filteredText = text.lowercased().components(separatedBy: characterFilters).joined()
+
+        // Split on any whitespace (spaces, tabs, newlines, ...) rather than only on " ",
+        // so multi-line or tab-separated input does not yield tokens with embedded whitespace.
+        let tokens = filteredText.split(whereSeparator: { $0.isWhitespace }).map { String($0) }
+
+        // Remove tokens contained in tokenFilters
+        return tokens.filter { !tokenFilters.contains($0) }
+    }
+
+    /// Applies `applyBasicTextProcessing(_:characterFilters:tokenFilters:)` to each string in `document`.
+    ///
+    /// - Parameters:
+    ///   - document: The strings to tokenise.
+    ///   - characterFilters: Characters to strip from each string before it is split.
+    ///   - tokenFilters: Tokens to discard from each result.
+    /// - Returns: One token list per input string, in the same order as `document`.
+    static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]] {
+        return document.map { applyBasicTextProcessing($0, characterFilters: characterFilters, tokenFilters: tokenFilters) }
+    }
+
+    /// Tokenises `document` and adds the resulting tokens to the corpus.
+    ///
+    /// Punctuation characters are stripped and no tokens are filtered out before the
+    /// tokens are forwarded to `addDocument(document:)` taking `[String]`.
+    ///
+    /// - Parameter document: The raw text of the document to add.
+    mutating func addDocument(document: String) {
+        let processedDocument = Self.applyBasicTextProcessing(document, characterFilters: CharacterSet.punctuationCharacters, tokenFilters: [])
+        addDocument(document: processedDocument)
+    }
+}
--- a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift
+++ b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPCorpus.swift
@@ -4,39 +4,30 @@ import Foundation
 protocol SNLPCorpus {
    
    associatedtype DocumentEncoding
-    
-    //var _dictionary: any SNLPDictionary { get set }
+        
    var encodedDocuments: [Int : DocumentEncoding] {get set}
-
+    var defaultDocumentEncoding: DocumentEncoding { get set }
+    
    mutating func addDocument(document: [String])
    mutating func addDocuments(documents: [[String]])
-    
-    static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String]
-    static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]]
 }


-public enum CorpusOutputFormat {
-    //case Blei
-    case Text
-}
-
 @available(macOS 10.15.0, *)
 extension SNLPCorpus {
    
    subscript(_ index: Int) -> DocumentEncoding {
        get {
-            return encodedDocuments[index]!
+            return encodedDocuments[index] ?? defaultDocumentEncoding
        }
        set(newValue) { // Should we provide a setter? is this useful?
            encodedDocuments[index] = newValue
        }
    }
    
-    
-    // TODO: Is there an easy way to provide a default implementation here? Would need to constrain DocumentEncoding to a Collection or Sequence at a minimum. 
-    //mutating func addDocument(document: Document) {
-    //    encodedDocuments[ encodedDocuments.count ] = dictionary.encode(document: document)
+    // TODO: Is there an easy way to provide a default implementation here?
+    //mutating func addDocument(document: [String]) {
+    //encodedDocuments[ encodedDocuments.count ] = DictionaryType.encode(document)
    //}
    
    mutating func addDocuments(documents: [[String]]) {
@@ -50,27 +41,4 @@ extension SNLPCorpus {
            addDocument(document: document)
        }
    }
-    
-    static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String] {
-        let lowercasedText = text.lowercased()
-        
-        // Remove characters contained in characterFilters
-        let wordsWithoutPunctuationAndNumbers = lowercasedText.components(separatedBy: characterFilters).joined().split(separator: " ")
-        
-        // Remove tokens contained in tokenFilters
-        let wordsWithoutStopWords = wordsWithoutPunctuationAndNumbers.filter { !tokenFilters.contains(String($0)) }
-        
-        return wordsWithoutStopWords.map { String($0) }
-    }
-    
-    
-    static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]] {
-        return document.map { applyBasicTextProcessing($0, characterFilters: characterFilters, tokenFilters: tokenFilters) }
-    }
-    
-    mutating func addDocument(document: String) {
-        let processedDocument = Self.applyBasicTextProcessing(document, characterFilters: CharacterSet.punctuationCharacters, tokenFilters: [])
-        addDocument(document: processedDocument)
-    }
 }
-    
--- a/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPDictionary.swift
+++ b/Sources/SwiftNLP/0. SNLP Internal Protocols/SNLPDictionary.swift
@@ -17,6 +17,10 @@ protocol SNLPDictionary : Codable {
    // A dictionary that maps words/tokens in a corpora to numerical values or embeddings
    var token2id: [Key: Value] { get set }
    var count: Int { get }
+    
+    var defaultTokenEncoding: Value { get set }
+    
+    //func encode(token: String) -> Value
 }

 public enum DictionaryType {
@@ -30,14 +34,18 @@ extension SNLPDictionary {

    subscript(_ index: Key) -> Value {
        get {
-            return token2id[index]!
+            return token2id[index, default: defaultTokenEncoding]
        }
        set(newValue) {
            token2id[index] = newValue
        }
    }
    
-    func contains(key: Key) -> Bool {
+//    func encode(_ key: Key) -> Value {
+//        return self[key]
+//    }
+        
+    func contains(_ key: Key) -> Bool {
        return token2id.keys.contains(key)
    }
    

--- a/Sources/SwiftNLP/2. Embeddings/BoWCorpus.swift
+++ b/Sources/SwiftNLP/2. Embeddings/BoWCorpus.swift
@@ -3,7 +3,7 @@ import Foundation

 @available(macOS 10.15.0, *)
 class BoWCorpus: SNLPCorpus {
-            
+                                    
    typealias DocumentEncoding = [(Int, Int)]
    
    
@@ -11,7 +11,7 @@ class BoWCorpus: SNLPCorpus {
            Storage for our documents
     */
    var encodedDocuments: [Int : DocumentEncoding]
-    
+    var defaultDocumentEncoding: [(Int, Int)] = [(0,0)]

    /*
            Storage for Word -> ID mappings
@@ -62,8 +62,8 @@ class BoWCorpus: SNLPCorpus {
            self.tokenFilterSet = Set<String>()
        }
        
-    }                    
-    
+    }
+        
    func addDocument(document: [String]) {
        // Construct a Dictionary containing the count of each token
        var counter: [String : Int] = Dictionary()
@@ -74,7 +74,7 @@ class BoWCorpus: SNLPCorpus {
        
        // Fill in missing values in our Dictionary
        if true {
-            let missing = counter.filter { !dictionary.contains(key: $0.key)  }
+            let missing = counter.filter { !dictionary.contains($0.key)  }
                                  .sorted { $0.key < $1.key }
            
            for (key, _) in missing {

--- a/Sources/SwiftNLP/2. Embeddings/BoWDictionary.swift
+++ b/Sources/SwiftNLP/2. Embeddings/BoWDictionary.swift
@@ -17,7 +17,7 @@ let basicStopwordSet: Set<String> = [


 class BoWDictionary: SNLPDictionary {
-                       
+                                       
    typealias Key = String
    typealias Value = Int
    
@@ -25,6 +25,8 @@ class BoWDictionary: SNLPDictionary {
    var token2id: [Key : Value]
    var id2token: [Value: Key]
    
+    var defaultTokenEncoding: Int = 0
+    
    internal var cfs: [Value: Int]
    internal var dfs: [Value: Int]
    
@@ -35,7 +37,7 @@ class BoWDictionary: SNLPDictionary {
    init() {
        token2id = [Key: Value]()
        id2token = [Value: Key]()
-        
+                
        cfs = [Value: Int]()
        dfs = [Value: Int]()
        

--- a/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift
+++ b/Sources/SwiftNLP/2. Embeddings/KeyedVectorCorpus.swift
@@ -12,26 +12,29 @@ import Surge

 @available(macOS 10.15.0, *)
 class KeyedVectorCorpus: SNLPCorpus {
-        
+                
    typealias DocumentEncoding = [Double]
    
    
-    var encodedDocuments: [Int : [Double]] = [:]
+    var encodedDocuments: [Int : DocumentEncoding] = [:]
+    var defaultDocumentEncoding: [Double]
    let width: Int
    
    
    var dictionary: KeyedVectorDictionary
    
    
+    
    init(source: KeyedVectorDictionary.PreComputedEmbeddings) {
        dictionary = KeyedVectorDictionary(source: source)
        width = dictionary.width
+        defaultDocumentEncoding = Array(repeating: Double(0), count: width)
    }
    
    
    func addDocument(document: [String]) {
        
-        var encoding = DocumentEncoding(repeating: 0, count: dictionary.width)
+        var encoding = defaultDocumentEncoding
        for word in document {
            encoding .+= dictionary[word] // Surge in-place element-wise addition
        }

--- a/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift
+++ b/Sources/SwiftNLP/2. Embeddings/KeyedVectorDictionary.swift
@@ -12,13 +12,14 @@ import SwiftAnnoy

 @available(macOS 10.15, *)
 class KeyedVectorDictionary: SNLPDictionary {
-    typealias Value = Vector
-    
+                
    typealias Key = String
-    typealias Vector = [Double]
+    typealias Value = [Double]
        
-    var token2id: [Key : Vector]
+    var token2id: [Key : Value]
    let width: Int
+    var defaultTokenEncoding: [Double]
+    
    var count: Int { token2id.count }
    
    public enum PreComputedEmbeddings {
@@ -45,6 +46,8 @@ class KeyedVectorDictionary: SNLPDictionary {
            dictionaryToLoad = "glove.6B.100d"
        }
        
+        defaultTokenEncoding = Array(repeating: Double(0), count: width)
+        
        // Try to load locally first
        guard let url = Bundle.module.url(forResource: dictionaryToLoad, withExtension: "mmap") else {
            debugPrint("File not found in bundle: \(dictionaryToLoad)")
@@ -53,15 +56,12 @@ class KeyedVectorDictionary: SNLPDictionary {
            return
        }
        token2id = KeyedVectorDictionary.readDictionaryFromFile(url)
-        
-        
-        //debugPrint("Loaded \(token2id.count) vectors from file")
-        //debugPrint("")
+                
    }
    
    subscript(_ index: Key) -> Value {
        get {
-            return token2id[index] ?? Array(repeating: Double(0), count: width)
+            return token2id[index] ?? defaultTokenEncoding
        }
        set(newValue) {
            token2id[index] = newValue
@@ -71,7 +71,7 @@ class KeyedVectorDictionary: SNLPDictionary {
    
    // These use memory mapping to load the values in more quickly
    // TODO: Validate that this actually works on other systems... could easily be some issues
-    static func writeDictionaryToFile(url: URL, dictionary: [Key : Vector]) {
+    static func writeDictionaryToFile(url: URL, dictionary: [Key : Value]) {

        let fileManager = FileManager.default
        if !fileManager.fileExists(atPath: url.path) {
@@ -106,10 +106,10 @@ class KeyedVectorDictionary: SNLPDictionary {
    
    // These use memory mapping to load the values in more quickly
    // TODO: Validate that this actually works on other systems... could easily be some issues
-    static func readDictionaryFromFile(_ url: URL) -> [Key : Vector] {
+    static func readDictionaryFromFile(_ url: URL) -> [Key : Value] {
        
        //let fileURL = URL(fileURLWithPath: filename)
-        var result: [Key : Vector]
+        var result: [Key : Value]
        
        do {
            let data = try Data(contentsOf: url, options: .alwaysMapped)
@@ -120,7 +120,7 @@ class KeyedVectorDictionary: SNLPDictionary {
            var index = MemoryLayout<Int>.size

            // Initialize the dictionary with the count
-            result = [Key : Vector](minimumCapacity: count)
+            result = [Key : Value](minimumCapacity: count)
            debugPrint("Loading Dictionary with \(count) items from file.")
            
            while index < data.count {
@@ -151,18 +151,18 @@ class KeyedVectorDictionary: SNLPDictionary {
            print("Error reading dictionary from file: \(error)")
        }
        
-        return [Key : Vector]()
+        return [Key : Value]()
    }
    
    
    // This is the slow way, don't use this for interactive code ...
    // Better to use memory mapped loaders above
-    static func readDictionaryFromTextFile(from url: URL) -> [Key: Vector]? {
+    static func readDictionaryFromTextFile(from url: URL) -> [Key: Value]? {
        do {
            let content = try String(contentsOf: url, encoding: .utf8)
            let lines = content.split(separator: "\n")

-            var data: [Key: Vector] = [:]
+            var data: [Key: Value] = [:]

            for line in lines.dropFirst() {
                let tokens = line.split(separator: " ")

--- a/Sources/SwiftNLP/3. Dimensionality Reduction/TruncatedKeyedVectorCorpus.swift
+++ b/Sources/SwiftNLP/3. Dimensionality Reduction/TruncatedKeyedVectorCorpus.swift
@@ -12,11 +12,12 @@ import Foundation

 @available(macOS 10.15.0, *)
 class TruncatedKeyedVectorCorpus : SNLPCorpus {
-        
+                
    typealias DocumentEncoding = [Double]
    
    
    var encodedDocuments: [Int : DocumentEncoding] = [:]
+    var defaultDocumentEncoding: [Double]
    
    private let _corpus: KeyedVectorCorpus // reference to the corpus we want to copy
    
@@ -34,6 +35,8 @@ class TruncatedKeyedVectorCorpus : SNLPCorpus {
        for document in _corpus.encodedDocuments {
            encodedDocuments[document.key] = Array(document.value.prefix(upTo: d))
        }
+        
+        defaultDocumentEncoding = Array(repeating: Double(0), count: d)
    }