Skip to content
Snippets Groups Projects
Commit 5dd23a4d authored by Jim Wallace's avatar Jim Wallace
Browse files

Cleaning up SNLPCorpus

parent 720d5322
No related branches found
No related tags found
No related merge requests found
Pipeline #108864 passed
......@@ -24,6 +24,13 @@
import Foundation
extension SNLPCorpus {
/**
Takes some untokenized text and:
- Converts to lowercase
- Removes stop words, punctuation, and numbers
- tokenizes based on whitespace
*/
static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String] {
let lowercasedText = text.lowercased()
......@@ -37,12 +44,31 @@ extension SNLPCorpus {
}
/**
 Runs the single-string overload of `applyBasicTextProcessing` over every element of `document`.

 - Parameters:
   - document: The strings to process; each element is handled independently.
   - characterFilters: Forwarded unchanged to the single-string overload.
   - tokenFilters: Forwarded unchanged to the single-string overload.
 - Returns: One token array per element of `document`, in the same order.
 */
static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]] {
    return document.map { text in
        applyBasicTextProcessing(
            text,
            characterFilters: characterFilters,
            tokenFilters: tokenFilters
        )
    }
}
mutating func addDocument(document: String) {
/**
 Processes one raw (untokenized) document and adds the result to the corpus.

 The text is run through `applyBasicTextProcessing`, passing
 `CharacterSet.punctuationCharacters` as `characterFilters` and an empty set as
 `tokenFilters`; the resulting tokens are handed to `addDocument(document:)`.

 - Parameter document: The raw text of the document to add.
 */
mutating func addUntokenizedDocument(_ document: String) {
    let noTokenFilters: Set<String> = []
    let tokens = Self.applyBasicTextProcessing(
        document,
        characterFilters: .punctuationCharacters,
        tokenFilters: noTokenFilters
    )
    addDocument(document: tokens)
}
/**
 Adds each raw (untokenized) document in `documents` to the corpus, in order.

 Every element is forwarded individually to `addUntokenizedDocument(_:)`.

 - Parameter documents: The raw texts to add.
 */
@inlinable
mutating func addUntokenizedDocuments(_ documents: [String]) {
    for rawText in documents {
        addUntokenizedDocument(rawText)
    }
}
}
......@@ -30,12 +30,12 @@ protocol SNLPCorpus {
var _documentEncoder: DocumentEncoder { get set }
var zeroes: DocumentEncoding { get }
//var width: Int { get }
//var rawDocuments: [ Int : String ] { get set }
var encodedDocuments: [Int : DocumentEncoding] { get set }
mutating func addDocument(document: [String])
mutating func addDocuments(documents: [[String]])
//mutating func addDocuments(documents: [[String]])
}
......@@ -62,20 +62,6 @@ extension SNLPCorpus {
}
encodedDocuments[ encodedDocuments.count ] = result
debugPrint("--> \(result)")
}
/**
 Adds each already-tokenized document in `documents` to the corpus, in order.

 Every element is forwarded individually to `addDocument(document:)`.

 - Parameter documents: The token arrays to add, one per document.
 */
@inlinable
mutating func addDocuments(documents: [[String]]) {
    for tokens in documents {
        addDocument(document: tokens)
    }
}
/**
 Adds each string in `documents` to the corpus, in order.

 Every element is forwarded individually to `addDocument(document:)`.

 - Parameter documents: The document strings to add.
 */
@inlinable
mutating func addDocuments(documents: [String]) {
for document in documents {
addDocument(document: document)
}
// NOTE(review): the commented-out statement below refers to `result`, which is
// not defined in this method — it looks like a leftover; confirm and remove.
//debugPrint("--> \(result)")
}
}
......@@ -24,7 +24,7 @@
import Foundation
class DictionaryCorpus<Precision: Collection & Codable>: SNLPCorpus where Precision.Element: BinaryFloatingPoint {
typealias DocumentEncoding = Precision
typealias DocumentEncoder = ContextFreeEncoder<DocumentEncoding>
......@@ -40,15 +40,15 @@ class DictionaryCorpus<Precision: Collection & Codable>: SNLPCorpus where Precis
}
// TODO: This isn't the most elegant ... rework?
func addDocument(document: [String]) {
func addTokenizedDocument(document: [String]) {
var result = Array<Precision.Element>(repeating: Precision.Element(0.0), count: _documentEncoder.width)
for token in document {
var encoding = _documentEncoder.encode(token: token)
let encoding = _documentEncoder.encode(token: token)
for i in 0 ..< result.count {
result[i] += encoding[i as! Precision.Index]
}
}
encodedDocuments[ encodedDocuments.count ] = result as! Precision
encodedDocuments[ encodedDocuments.count ] = (result as! Precision)
}
}
......@@ -14,7 +14,7 @@ final class ContextFreeEncoderTests: XCTestCase {
]
var corpus = DictionaryCorpus(encoding: .glove6B50d)
corpus.addDocuments(documents: docs)
corpus.addUntokenizedDocuments(docs)
XCTAssert(corpus.encodedDocuments.count == 3)
......@@ -51,7 +51,7 @@ final class ContextFreeEncoderTests: XCTestCase {
]
var corpus = DictionaryCorpus(encoding: .glove6B50d)
corpus.addDocuments(documents: twentyQuotes)
corpus.addUntokenizedDocuments(twentyQuotes)
XCTAssertEqual(corpus.encodedDocuments.count, 20)
......@@ -76,7 +76,7 @@ final class ContextFreeEncoderTests: XCTestCase {
var corpus = DictionaryCorpus(encoding: .glove6B50d)
for submission in submissions {
if let text = submission.selftext {
corpus.addDocument(document: text)
corpus.addUntokenizedDocument(text)
}
}
//print("Loaded \(corpus.encodedDocuments.count) documents.")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment