Skip to content
Snippets Groups Projects
Commit 3f908e3b authored by Jim Wallace
Browse files

A first pass at SNLPCorpus.

parent c7724185
No related branches found
No related tags found
No related merge requests found
Pipeline #89000 passed
//
// preprocessing.swift
// SwiftNLP
//
// Created by Jim Wallace on 2023-03-02.
//
import Foundation
import NaturalLanguage
/// A set of tokenized documents plus the `SNLPDictionary` populated from them.
///
/// The corpus lowercases each input string, deletes every character found in
/// `characterFilterSet`, splits the remainder on whitespace, and drops every
/// token found in `tokenFilterSet`.
class SNLPCorpus {

    /// The processed documents, one token array per input string.
    var documents: [Document]

    /// Backing storage for `dictionary`.
    private var _dictionary: SNLPDictionary<String, Int>

    /// The dictionary that `initializeDictionary()` feeds `documents` into.
    var dictionary: SNLPDictionary<String, Int> {
        get { return _dictionary }
        set { _dictionary = newValue }
    }

    /// Union of all character sets passed to `init`; these characters are deleted from the text.
    var characterFilterSet: CharacterSet
    //let tokenizer = ????
    /// Union of all token sets passed to `init`; these tokens are removed from each document.
    var tokenFilterSet: Set<Word>

    /// Creates a corpus, optionally processing `input` immediately.
    ///
    /// - Parameters:
    ///   - input: Raw document strings. `nil` leaves `documents` empty.
    ///   - characterFilters: Character sets whose union is deleted from the text.
    ///     `nil` disables character filtering.
    ///   - tokenFilters: Token sets whose union is removed after tokenization.
    ///     `nil` disables token filtering.
    init(_ input: [String]? = nil, characterFilters: [CharacterSet]? = [CharacterSet.punctuationCharacters, CharacterSet.decimalDigits], tokenFilters: [Set<Word>]? = [basicStopwordSet]) {
        self.documents = []
        self._dictionary = SNLPDictionary<String, Int>()

        // Merge the supplied filters; a nil argument yields an empty filter.
        self.characterFilterSet = (characterFilters ?? []).reduce(into: CharacterSet()) { $0.formUnion($1) }
        self.tokenFilterSet = (tokenFilters ?? []).reduce(into: Set<Word>()) { $0.formUnion($1) }

        // Process the text we've been given
        if let text = input {
            documents = applyBasicTextProcessing(text, characterFilters: characterFilterSet, tokenFilters: tokenFilterSet)
        }

        // Populate the dictionary from whatever documents we ended up with
        initializeDictionary()
    }

    /// Passes the current `documents` to `dictionary.addDocuments(documents:)`.
    ///
    /// Each call hands over the full `documents` array again.
    // NOTE(review): presumably a second call double-counts; confirm against SNLPDictionary.
    func initializeDictionary() {
        self.dictionary.addDocuments(documents: documents)
    }

    /// Turns one raw string into a filtered token list.
    ///
    /// - Parameters:
    ///   - text: The raw document text.
    ///   - characterFilters: Characters to delete before tokenizing.
    ///   - tokenFilters: Tokens to discard after tokenizing.
    /// - Returns: The surviving lowercase tokens in their original order.
    func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> Document {
        // Filtered characters are deleted outright (not replaced by a space),
        // so the text on either side of them is joined together.
        let strippedText = text.lowercased()
            .components(separatedBy: characterFilters)
            .joined()

        // Split on any whitespace, not just " ", so tabs and line breaks
        // also separate tokens.
        return strippedText
            .split(whereSeparator: { $0.isWhitespace })
            .map { String($0) }
            .filter { !tokenFilters.contains($0) }
    }

    /// Applies the single-string `applyBasicTextProcessing` to every element of `documents`.
    ///
    /// - Returns: One token array per input string, in input order.
    func applyBasicTextProcessing(_ documents: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [Document] {
        return documents.map { applyBasicTextProcessing($0, characterFilters: characterFilters, tokenFilters: tokenFilters) }
    }
}
......@@ -10,7 +10,17 @@ import Foundation
/// A single token.
public typealias Word = String
/// A tokenized document: an ordered list of words.
public typealias Document = [Word]
/// A list of tokenized documents.
public typealias Corpus = [Document]
/// A word encoded as any `BinaryInteger` value.
public typealias EncodedWord = any BinaryInteger
/// A document whose words have been encoded.
public typealias EncodedDocument = [EncodedWord]
/// A list of encoded documents.
public typealias EncodedCorpus = [EncodedDocument]
/// A list of (word, probability) pairs.
public typealias Topic = [(word: Word, probability: Double)]
/// A list of topics.
public typealias TopicDistribution = [Topic]
/// Default lowercase stop words used as the token filter for basic text processing.
let basicStopwordSet: Set<Word> = [
"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming",
"hereby", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
]
//public typealias LDAResultHandler = (Result<TopicDistribution, Error>) -> Void // Not sure this is necessary?
//
// preprocessing.swift
// SwiftNLP
//
// Created by Jim Wallace on 2023-03-02.
//
import Foundation
import NaturalLanguage
/// Default lowercase stop words used as the `stopwords` argument for basic text processing.
let basicStopwordSet: Set<Word> = [
"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming",
"hereby", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
]
/// Turns one raw string into a filtered token list.
///
/// The text is lowercased, punctuation and decimal digits are deleted, the
/// remainder is split on whitespace, and stop words are dropped.
///
/// - Parameters:
///   - text: The raw document text.
///   - stopwords: Tokens to discard; defaults to `basicStopwordSet`.
/// - Returns: The surviving lowercase tokens in their original order.
func applyBasicTextProcessing(_ text: String, stopwords: Set<String> = basicStopwordSet) -> Document {
    // Characters to strip: punctuation plus numeric digits
    let basicFilterCharacterSet = CharacterSet.punctuationCharacters.union(CharacterSet.decimalDigits)

    // Filtered characters are deleted outright (not replaced by a space),
    // so the text on either side of them is joined together.
    let strippedText = text.lowercased()
        .components(separatedBy: basicFilterCharacterSet)
        .joined()

    // Split on any whitespace, not just " ", so tabs and line breaks
    // also separate tokens.
    return strippedText
        .split(whereSeparator: { $0.isWhitespace })
        .map { String($0) }
        .filter { !stopwords.contains($0) }
}
/// Applies the single-string `applyBasicTextProcessing` to every element of `documents`.
///
/// - Parameters:
///   - documents: The raw document strings.
///   - stopwords: Tokens to discard from each document; defaults to `basicStopwordSet`.
/// - Returns: One token array per input string, in input order.
func applyBasicTextProcessing(_ documents: [String], stopwords: Set<String> = basicStopwordSet) -> [Document] {
    var processed: [Document] = []
    processed.reserveCapacity(documents.count)
    for rawText in documents {
        processed.append(applyBasicTextProcessing(rawText, stopwords: stopwords))
    }
    return processed
}
......@@ -13,16 +13,18 @@ final class SwiftNLPTests: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
let cleanDocs = applyBasicTextProcessing(docs)
let corpus: SNLPCorpus = SNLPCorpus(docs)
var dict = SNLPDictionary<Word, Int>()
//let cleanDocs = applyBasicTextProcessing(docs)
dict.addDocuments(documents: cleanDocs)
//var dict = SNLPDictionary<Word, Int>()
//dict.addDocuments(documents: cleanDocs)
//debugPrint(dict)
XCTAssertEqual(dict.numDocs, 3)
XCTAssertEqual(dict.numPos, 19)
XCTAssertEqual(dict.numNNZ, 19)
XCTAssertEqual(corpus.dictionary.numDocs, 3)
XCTAssertEqual(corpus.dictionary.numPos, 19)
XCTAssertEqual(corpus.dictionary.numNNZ, 19)
}
func testBiggerExample() throws {
......@@ -50,20 +52,22 @@ final class SwiftNLPTests: XCTestCase {
"All science is either physics or stamp collecting. - Ernest Rutherford"
]
var dict = SNLPDictionary<Word, Int>()
let corpus = SNLPCorpus(twentyQuotes)
//var dict = SNLPDictionary<Word, Int>()
let cleanedQuotes = applyBasicTextProcessing(twentyQuotes)
//let cleanedQuotes = applyBasicTextProcessing(twentyQuotes)
//debugPrint(cleanedQuotes)
dict.addDocuments(documents: cleanedQuotes)
//dict.addDocuments(documents: cleanedQuotes)
//debugPrint(dict)
debugPrint(dict.mostCommon(10))
//debugPrint(dict.mostCommon(10))
XCTAssertEqual(dict.numDocs, 20)
XCTAssertEqual(dict.numPos, 188)
XCTAssertEqual(dict.numNNZ, 178)
XCTAssertEqual(corpus.dictionary.numDocs, 20)
XCTAssertEqual(corpus.dictionary.numPos, 188)
XCTAssertEqual(corpus.dictionary.numNNZ, 178)
}
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment