Skip to content
Snippets Groups Projects
Commit 2d561b2a authored by Jim Wallace
Browse files

Stores EncodedDocument in Corpus. TODO: Need to clean up types across definitions/corpus/dictionary

parent 3f908e3b
No related branches found
No related tags found
No related merge requests found
Pipeline #89053 passed
...@@ -12,6 +12,8 @@ import NaturalLanguage ...@@ -12,6 +12,8 @@ import NaturalLanguage
class SNLPCorpus { class SNLPCorpus {
var documents: [Document] var documents: [Document]
var encodedDocuments: [[(Int, Int)]]
private var _dictionary: SNLPDictionary<String, Int> private var _dictionary: SNLPDictionary<String, Int>
var dictionary: SNLPDictionary<String, Int> { var dictionary: SNLPDictionary<String, Int> {
...@@ -31,6 +33,8 @@ class SNLPCorpus { ...@@ -31,6 +33,8 @@ class SNLPCorpus {
init(_ input: [String]? = nil, characterFilters: [CharacterSet]? = [CharacterSet.punctuationCharacters,CharacterSet.decimalDigits], tokenFilters: [Set<Word>]? = [basicStopwordSet]) { init(_ input: [String]? = nil, characterFilters: [CharacterSet]? = [CharacterSet.punctuationCharacters,CharacterSet.decimalDigits], tokenFilters: [Set<Word>]? = [basicStopwordSet]) {
self.documents = [] self.documents = []
self.encodedDocuments = []
self._dictionary = SNLPDictionary<String, Int>() self._dictionary = SNLPDictionary<String, Int>()
// Create a set of Characters to filter out // Create a set of Characters to filter out
...@@ -68,7 +72,7 @@ class SNLPCorpus { ...@@ -68,7 +72,7 @@ class SNLPCorpus {
func initializeDictionary() { func initializeDictionary() {
self.dictionary.addDocuments(documents: documents) encodedDocuments = self.dictionary.addDocuments(documents: documents)
} }
......
...@@ -11,10 +11,12 @@ public typealias Word = String ...@@ -11,10 +11,12 @@ public typealias Word = String
public typealias Document = [Word] public typealias Document = [Word]
public typealias Corpus = [Document] public typealias Corpus = [Document]
public typealias EncodedWord = any BinaryInteger public typealias EncodedWord = (Int, Int)
public typealias EncodedDocument = [EncodedWord] public typealias EncodedDocument = [EncodedWord]
public typealias EncodedCorpus = [EncodedDocument] public typealias EncodedCorpus = [EncodedDocument]
public typealias EncodedWerd = BinaryInteger & Codable
public typealias Topic = [(word: Word, probability: Double)] public typealias Topic = [(word: Word, probability: Double)]
public typealias TopicDistribution = [Topic] public typealias TopicDistribution = [Topic]
......
...@@ -90,10 +90,15 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger ...@@ -90,10 +90,15 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
// >>> len(dct) // >>> len(dct)
// 10 // 10
// """ // """
mutating func addDocuments(documents: [Document]) { mutating func addDocuments(documents: [Document]) -> [[(Value, Int)]] {
var result = [[(Value, Int)]]()
for document in documents { for document in documents {
documentToBagOfWords(document: document, allowUpdate: true) result.append(documentToBagOfWords(document: document, allowUpdate: true))
} }
return result
} }
...@@ -125,37 +130,28 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger ...@@ -125,37 +130,28 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
// ([(2, 1)], {u'this': 1, u'is': 1}) // ([(2, 1)], {u'this': 1, u'is': 1})
// """ // """
mutating func documentToBagOfWords(document: Document, allowUpdate: Bool = false, returnMissing: Bool = false) { mutating func documentToBagOfWords(document: Document, allowUpdate: Bool = false) -> [(Value, Int)] {
// Construct a Dictionary containing the count of each token // Construct a Dictionary containing the count of each token
var counter: [Key : Int] = Dictionary() var counter: [Word : Int] = Dictionary()
for token in document { for token in document {
counter[token as! Key, default: 0] += 1 counter[ token, default: 0] += 1
} }
// Fill in missing values in our Dictionary // Fill in missing values in our Dictionary
if allowUpdate || returnMissing { if allowUpdate {
let missing = counter.filter { !token2id.keys.contains($0.key) } let missing = counter.filter { !token2id.keys.contains($0.key as! Key) }
.sorted { $0.key < $1.key } .sorted { $0.key < $1.key }
if allowUpdate {
for (key, _) in missing { for (key, _) in missing {
token2id[key] = Value(token2id.count) token2id[ key as! Key ] = Value(token2id.count)
}
} }
} }
// Create a result // Create a result
let result = counter.compactMap { (key, value) -> (Value, Int)? in //TODO: Confirm that ChatGPT generated this correctly let result = counter.compactMap({ (token2id[$0.key as! Key]!, $0.value) })
guard let tokenIndex = token2id[key] else {
return nil
}
return (Value(tokenIndex), value) as (Value, Int)
}
.reduce(into: [Value: Int]()) { dict, tuple in
dict[tuple.0] = tuple.1
}
// Update our counters // Update our counters
if allowUpdate { if allowUpdate {
numDocs += 1 numDocs += 1
...@@ -168,7 +164,7 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger ...@@ -168,7 +164,7 @@ struct SNLPDictionary<Key: Hashable & Codable & Comparable, Value: BinaryInteger
self.dfs[tokenid, default: 0] += 1 self.dfs[tokenid, default: 0] += 1
} }
} }
return result
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment