A first pass at SNLPCorpus.

3f908e3b · Jim Wallace · c7724185 · 3f908e3b · 3f908e3b · c7724185
Commit 3f908e3b authored 2 years ago by Jim Wallace
--- a/Sources/SwiftNLP/SNLPCorpus.swift
+++ b/Sources/SwiftNLP/SNLPCorpus.swift
+//
+//  SNLPCorpus.swift
+//  SwiftNLP
+//
+//  Created by Jim Wallace on 2023-03-02.
+//
+
+import Foundation
+import NaturalLanguage
+
+
+/// A corpus of tokenized documents plus the dictionary built from them.
+///
+/// Each input string is lowercased, has every character in `characterFilterSet`
+/// deleted, is split into tokens on whitespace, and finally has every token in
+/// `tokenFilterSet` dropped.
+class SNLPCorpus {
+
+    /// The tokenized documents in this corpus.
+    var documents: [Document]
+
+    /// Dictionary that `initializeDictionary()` feeds `documents` into.
+    var dictionary: SNLPDictionary<String, Int>
+
+    /// Characters deleted from the text before it is split into tokens.
+    var characterFilterSet: CharacterSet
+    //let tokenizer = ????
+    /// Tokens dropped after the text has been split.
+    var tokenFilterSet: Set<Word>
+
+
+    /// Creates a corpus and, when `input` is given, processes it immediately.
+    ///
+    /// - Parameters:
+    ///   - input: Raw documents to process; `nil` leaves `documents` empty.
+    ///   - characterFilters: Character sets whose union is deleted from the text;
+    ///     `nil` filters no characters.
+    ///   - tokenFilters: Token sets whose union is dropped after tokenization;
+    ///     `nil` filters no tokens.
+    init(_ input: [String]? = nil, characterFilters: [CharacterSet]? = [CharacterSet.punctuationCharacters,CharacterSet.decimalDigits], tokenFilters: [Set<Word>]? = [basicStopwordSet]) {
+
+        self.documents = []
+        self.dictionary = SNLPDictionary<String, Int>()
+
+        // Collapse the individual filters into one set each; a nil list means "no filtering".
+        self.characterFilterSet = (characterFilters ?? []).reduce(into: CharacterSet()) { $0.formUnion($1) }
+        self.tokenFilterSet = (tokenFilters ?? []).reduce(into: Set<Word>()) { $0.formUnion($1) }
+
+        // Process the text we've been given
+        if let text = input {
+            documents = applyBasicTextProcessing(text, characterFilters: characterFilterSet, tokenFilters: tokenFilterSet)
+        }
+
+        // Populate the dictionary from the processed documents
+        initializeDictionary()
+    }
+
+
+    /// Passes the current `documents` to the dictionary's `addDocuments(documents:)`.
+    ///
+    /// NOTE(review): presumably calling this more than once counts the same
+    /// documents again -- confirm against `SNLPDictionary`.
+    func initializeDictionary() {
+        dictionary.addDocuments(documents: documents)
+    }
+
+
+    /// Turns one raw string into a tokenized document.
+    ///
+    /// - Parameters:
+    ///   - text: The raw text; it is lowercased before any filtering.
+    ///   - characterFilters: Characters to delete from the lowercased text.
+    ///   - tokenFilters: Tokens to drop; compared against the lowercased tokens.
+    /// - Returns: The remaining tokens in their original order.
+    func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> Document {
+        // Filtered characters are deleted outright (not replaced by a space),
+        // so "well-known" becomes the single token "wellknown".
+        let strippedText = text.lowercased().components(separatedBy: characterFilters).joined()
+
+        // Split on any whitespace, not just " ", so tabs and line breaks also
+        // separate words; empty pieces from repeated whitespace are omitted.
+        return strippedText
+            .split(whereSeparator: \.isWhitespace)
+            .map(String.init)
+            .filter { !tokenFilters.contains($0) }
+    }
+
+
+    /// Applies `applyBasicTextProcessing(_:characterFilters:tokenFilters:)` to every string in `documents`.
+    ///
+    /// - Returns: One tokenized document per input string, in input order.
+    func applyBasicTextProcessing(_ documents: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [Document] {
+        return documents.map { applyBasicTextProcessing($0, characterFilters: characterFilters, tokenFilters: tokenFilters) }
+    }
+
+}
--- a/Sources/SwiftNLP/SNLPDefinitions.swift
+++ b/Sources/SwiftNLP/SNLPDefinitions.swift
@@ -10,7 +10,17 @@ import Foundation
 public typealias Word = String
 public typealias Document = [Word]
 public typealias Corpus = [Document]
+
+public typealias EncodedWord = any BinaryInteger
+public typealias EncodedDocument = [EncodedWord]
+public typealias EncodedCorpus = [EncodedDocument]
+
 public typealias Topic = [(word: Word, probability: Double)]
 public typealias TopicDistribution = [Topic]

+/// English stop words; `SNLPCorpus.init` uses this set as its default token filter.
+let basicStopwordSet: Set<Word> = [
+"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming", 
"hereby", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
+]
+
+
 //public typealias LDAResultHandler = (Result<TopicDistribution, Error>) -> Void // Not sure this is necessary? 
--- a/Sources/SwiftNLP/preprocessing.swift
+++ b/Sources/SwiftNLP/preprocessing.swift
-//
-//  preprocessing.swift
-//  SwiftNLP
-//
-//  Created by Jim Wallace on 2023-03-02.
-//
-
-import Foundation
-import NaturalLanguage
-
-
-let basicStopwordSet: Set<Word> = [
-"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming", 
"hereby", "amongst", "else", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
-]
-
-
-func applyBasicTextProcessing(_ text: String, stopwords: Set<String> = basicStopwordSet) -> Document {
-    let lowercasedText = text.lowercased()
-    
-    // Create a custom character set that includes punctuation and numeric characters
-    let basicFilterCharacterSet = CharacterSet.punctuationCharacters.union(CharacterSet.decimalDigits)
-    
-    // Remove punctuation and numeric characters
-    let wordsWithoutPunctuationAndNumbers = lowercasedText.components(separatedBy: basicFilterCharacterSet).joined().split(separator: " ")
-    
-    // Remove stop words
-    let wordsWithoutStopWords = wordsWithoutPunctuationAndNumbers.filter { !stopwords.contains(String($0)) }
-    
-    return wordsWithoutStopWords.map { String($0) }
-}
-
-
-func applyBasicTextProcessing(_ documents: [String], stopwords: Set<String> = basicStopwordSet) -> [Document] {
-    return documents.map { applyBasicTextProcessing($0, stopwords: stopwords) }
-}
--- a/Tests/SwiftNLPTests/SNLPDictionaryTests.swift
+++ b/Tests/SwiftNLPTests/SNLPDictionaryTests.swift
@@ -13,16 +13,18 @@ final class SwiftNLPTests: XCTestCase {
            "that enable us to train deep learning algorithms to learn like the human brain."
         ]
        
-        let cleanDocs = applyBasicTextProcessing(docs)
+        let corpus: SNLPCorpus = SNLPCorpus(docs)
        
-        var dict = SNLPDictionary<Word, Int>()
+        //let cleanDocs = applyBasicTextProcessing(docs)
        
-        dict.addDocuments(documents: cleanDocs)
+        //var dict = SNLPDictionary<Word, Int>()
+        
+        //dict.addDocuments(documents: cleanDocs)
        //debugPrint(dict)
        
-        XCTAssertEqual(dict.numDocs, 3)
-        XCTAssertEqual(dict.numPos, 19)
-        XCTAssertEqual(dict.numNNZ, 19)
+        XCTAssertEqual(corpus.dictionary.numDocs, 3)
+        XCTAssertEqual(corpus.dictionary.numPos, 19)
+        XCTAssertEqual(corpus.dictionary.numNNZ, 19)
    }
    
    func testBiggerExample() throws {
@@ -50,20 +52,22 @@ final class SwiftNLPTests: XCTestCase {
            "All science is either physics or stamp collecting. - Ernest Rutherford"
        ]

-        var dict = SNLPDictionary<Word, Int>()
+        let corpus = SNLPCorpus(twentyQuotes)
+        
+        //var dict = SNLPDictionary<Word, Int>()
        
-        let cleanedQuotes = applyBasicTextProcessing(twentyQuotes)
+        //let cleanedQuotes = applyBasicTextProcessing(twentyQuotes)
        
        //debugPrint(cleanedQuotes)
        
-        dict.addDocuments(documents: cleanedQuotes)
+        //dict.addDocuments(documents: cleanedQuotes)
        //debugPrint(dict)
        
-        debugPrint(dict.mostCommon(10))
+        //debugPrint(dict.mostCommon(10))
        
-        XCTAssertEqual(dict.numDocs, 20)
-        XCTAssertEqual(dict.numPos, 188)
-        XCTAssertEqual(dict.numNNZ, 178)
+        XCTAssertEqual(corpus.dictionary.numDocs, 20)
+        XCTAssertEqual(corpus.dictionary.numPos, 188)
+        XCTAssertEqual(corpus.dictionary.numNNZ, 178)
        
    }
 }