Skip to content
Snippets Groups Projects
Commit 8d6c3c05 authored by Jim Wallace's avatar Jim Wallace
Browse files

Tidy up SNLP Protocols

parent bd7aeb1c
No related branches found
No related tags found
No related merge requests found
import Foundation
@available(macOS 10.15.0, *)
extension SNLPCorpus {

    /// Lowercases `text`, deletes filtered characters, and splits the result into tokens.
    ///
    /// - Parameters:
    ///   - text: Raw text to tokenize.
    ///   - characterFilters: Characters deleted from the text before it is split. They are
    ///     removed outright (not replaced by a space), so `"a,b"` becomes the token `"ab"`.
    ///   - tokenFilters: Tokens to drop; compared against the lowercased, stripped tokens.
    /// - Returns: The remaining tokens, in their original order.
    static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String] {
        let strippedText = text.lowercased()
            .components(separatedBy: characterFilters)
            .joined()

        // Split on any whitespace rather than only " ", so tabs and line breaks
        // also terminate a token instead of ending up inside one.
        return strippedText
            .split(whereSeparator: { $0.isWhitespace })
            .map { String($0) }
            .filter { !tokenFilters.contains($0) }
    }

    /// Tokenizes every text in `document` with the single-text overload.
    ///
    /// - Returns: One token list per input text, in input order.
    static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]] {
        return document.map { applyBasicTextProcessing($0, characterFilters: characterFilters, tokenFilters: tokenFilters) }
    }

    /// Tokenizes a raw text document and adds the tokens to the corpus.
    ///
    /// Punctuation characters are removed and no tokens are filtered out before
    /// the result is handed to `addDocument(document: [String])`.
    mutating func addDocument(document: String) {
        let processedDocument = Self.applyBasicTextProcessing(document, characterFilters: CharacterSet.punctuationCharacters, tokenFilters: [])
        addDocument(document: processedDocument)
    }
}
......@@ -4,39 +4,30 @@ import Foundation
/// A corpus that stores one encoding per document, keyed by an `Int`.
protocol SNLPCorpus {
/// The representation used for a single encoded document.
associatedtype DocumentEncoding
//var _dictionary: any SNLPDictionary { get set }
/// Encoded documents, keyed by an `Int`.
var encodedDocuments: [Int : DocumentEncoding] {get set}
/// NOTE(review): appears intended as the fallback returned when no encoding is
/// stored for a requested index — confirm against the subscript implementation.
var defaultDocumentEncoding: DocumentEncoding { get set }
/// Adds a single document, given as a list of string tokens.
mutating func addDocument(document: [String])
/// Adds several documents, each given as a list of string tokens.
mutating func addDocuments(documents: [[String]])
/// Turns one text into tokens, applying the given character and token filters.
static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String]
/// Turns each text in `document` into its own token list, using the same filters.
static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]]
}
/// Selectable output format (per the type's name, for corpora).
/// Only `Text` is enabled; the `Blei` case is commented out.
public enum CorpusOutputFormat {
//case Blei
case Text
}
@available(macOS 10.15.0, *)
extension SNLPCorpus {
/// Accesses the encoding stored for `index`.
///
/// Reading an index that has no stored encoding returns `defaultDocumentEncoding`
/// rather than trapping on a force-unwrap. Writing stores `newValue` under `index`.
subscript(_ index: Int) -> DocumentEncoding {
    get {
        return encodedDocuments[index] ?? defaultDocumentEncoding
    }
    set(newValue) { // Should we provide a setter? is this useful?
        encodedDocuments[index] = newValue
    }
}
// TODO: Is there an easy way to provide a default implementation here? Would need to constrain DocumentEncoding to a Collection or Sequence at a minimum.
//mutating func addDocument(document: Document) {
// encodedDocuments[ encodedDocuments.count ] = dictionary.encode(document: document)
// TODO: Is there an easy way to provide a default implementation here?
//mutating func addDocument(document: [String]) {
//encodedDocuments[ encodedDocuments.count ] = DictionaryType.encode(document)
//}
mutating func addDocuments(documents: [[String]]) {
......@@ -50,27 +41,4 @@ extension SNLPCorpus {
addDocument(document: document)
}
}
/// Lowercases `text`, deletes filtered characters, and splits the result into tokens.
///
/// - Parameters:
///   - text: Raw text to tokenize.
///   - characterFilters: Characters deleted from the text before it is split. They are
///     removed outright (not replaced by a space), so `"a,b"` becomes the token `"ab"`.
///   - tokenFilters: Tokens to drop; compared against the lowercased, stripped tokens.
/// - Returns: The remaining tokens, in their original order.
static func applyBasicTextProcessing(_ text: String, characterFilters: CharacterSet, tokenFilters: Set<String>) -> [String] {
    let strippedText = text.lowercased()
        .components(separatedBy: characterFilters)
        .joined()

    // Split on any whitespace rather than only " ", so tabs and line breaks
    // also terminate a token instead of ending up inside one.
    return strippedText
        .split(whereSeparator: { $0.isWhitespace })
        .map { String($0) }
        .filter { !tokenFilters.contains($0) }
}
/// Runs the single-text overload over every entry of `document`.
///
/// - Returns: One token list per input text, in the same order as the input.
static func applyBasicTextProcessing(_ document: [String], characterFilters: CharacterSet, tokenFilters: Set<String>) -> [[String]] {
    var tokenizedTexts: [[String]] = []
    tokenizedTexts.reserveCapacity(document.count)
    for text in document {
        let tokens: [String] = applyBasicTextProcessing(text, characterFilters: characterFilters, tokenFilters: tokenFilters)
        tokenizedTexts.append(tokens)
    }
    return tokenizedTexts
}
/// Adds a raw text document to the corpus.
///
/// The text is tokenized with punctuation characters removed and an empty
/// token filter, then passed on to `addDocument(document: [String])`.
mutating func addDocument(document: String) {
    let noTokenFilters: Set<String> = []
    let tokens: [String] = Self.applyBasicTextProcessing(
        document,
        characterFilters: .punctuationCharacters,
        tokenFilters: noTokenFilters
    )
    addDocument(document: tokens)
}
}
......@@ -17,6 +17,10 @@ protocol SNLPDictionary : Codable {
// A dictionary that maps words/tokens in a corpora to numerical values or embeddings
var token2id: [Key: Value] { get set }
var count: Int { get }
var defaultTokenEncoding: Value { get set }
//func encode(token: String) -> Value
}
public enum DictionaryType {
......@@ -30,14 +34,18 @@ extension SNLPDictionary {
/// Accesses the value stored for the token `index`.
///
/// Reading a key that is not in `token2id` returns `defaultTokenEncoding`
/// rather than trapping on a force-unwrap. Writing stores `newValue` under `index`.
subscript(_ index: Key) -> Value {
    get {
        return token2id[index, default: defaultTokenEncoding]
    }
    set(newValue) {
        token2id[index] = newValue
    }
}
func contains(key: Key) -> Bool {
// func encode(_ key: Key) -> Value {
// return self[key]
// }
func contains(_ key: Key) -> Bool {
return token2id.keys.contains(key)
}
......
......@@ -3,7 +3,7 @@ import Foundation
@available(macOS 10.15.0, *)
class BoWCorpus: SNLPCorpus {
typealias DocumentEncoding = [(Int, Int)]
......@@ -11,7 +11,7 @@ class BoWCorpus: SNLPCorpus {
Storage for our documents
*/
var encodedDocuments: [Int : DocumentEncoding]
var defaultDocumentEncoding: [(Int, Int)] = [(0,0)]
/*
Storage for Word -> ID mappings
......@@ -62,8 +62,8 @@ class BoWCorpus: SNLPCorpus {
self.tokenFilterSet = Set<String>()
}
}
}
func addDocument(document: [String]) {
// Construct a Dictionary containing the count of each token
var counter: [String : Int] = Dictionary()
......@@ -74,7 +74,7 @@ class BoWCorpus: SNLPCorpus {
// Fill in missing values in our Dictionary
if true {
let missing = counter.filter { !dictionary.contains(key: $0.key) }
let missing = counter.filter { !dictionary.contains($0.key) }
.sorted { $0.key < $1.key }
for (key, _) in missing {
......
......@@ -17,7 +17,7 @@ let basicStopwordSet: Set<String> = [
class BoWDictionary: SNLPDictionary {
typealias Key = String
typealias Value = Int
......@@ -25,6 +25,8 @@ class BoWDictionary: SNLPDictionary {
var token2id: [Key : Value]
var id2token: [Value: Key]
var defaultTokenEncoding: Int = 0
internal var cfs: [Value: Int]
internal var dfs: [Value: Int]
......@@ -35,7 +37,7 @@ class BoWDictionary: SNLPDictionary {
init() {
token2id = [Key: Value]()
id2token = [Value: Key]()
cfs = [Value: Int]()
dfs = [Value: Int]()
......
......@@ -12,26 +12,29 @@ import Surge
@available(macOS 10.15.0, *)
class KeyedVectorCorpus: SNLPCorpus {
typealias DocumentEncoding = [Double]
var encodedDocuments: [Int : [Double]] = [:]
var encodedDocuments: [Int : DocumentEncoding] = [:]
var defaultDocumentEncoding: [Double]
let width: Int
var dictionary: KeyedVectorDictionary
/// Creates a corpus backed by a `KeyedVectorDictionary` constructed from `source`.
///
/// `width` is taken from the dictionary's `width`, and `defaultDocumentEncoding`
/// starts out as an all-zero vector with that many entries.
init(source: KeyedVectorDictionary.PreComputedEmbeddings) {
    let embeddings = KeyedVectorDictionary(source: source)
    let dimension = embeddings.width

    dictionary = embeddings
    width = dimension
    defaultDocumentEncoding = [Double](repeating: 0, count: dimension)
}
func addDocument(document: [String]) {
var encoding = DocumentEncoding(repeating: 0, count: dictionary.width)
var encoding = defaultDocumentEncoding
for word in document {
encoding .+= dictionary[word] // Surge in-place element-wise addition
}
......
......@@ -12,13 +12,14 @@ import SwiftAnnoy
@available(macOS 10.15, *)
class KeyedVectorDictionary: SNLPDictionary {
typealias Value = Vector
typealias Key = String
typealias Vector = [Double]
typealias Value = [Double]
var token2id: [Key : Vector]
var token2id: [Key : Value]
let width: Int
var defaultTokenEncoding: [Double]
var count: Int { token2id.count }
public enum PreComputedEmbeddings {
......@@ -45,6 +46,8 @@ class KeyedVectorDictionary: SNLPDictionary {
dictionaryToLoad = "glove.6B.100d"
}
defaultTokenEncoding = Array(repeating: Double(0), count: width)
// Try to load locally first
guard let url = Bundle.module.url(forResource: dictionaryToLoad, withExtension: "mmap") else {
debugPrint("File not found in bundle: \(dictionaryToLoad)")
......@@ -53,15 +56,12 @@ class KeyedVectorDictionary: SNLPDictionary {
return
}
token2id = KeyedVectorDictionary.readDictionaryFromFile(url)
//debugPrint("Loaded \(token2id.count) vectors from file")
//debugPrint("")
}
subscript(_ index: Key) -> Value {
get {
return token2id[index] ?? Array(repeating: Double(0), count: width)
return token2id[index] ?? defaultTokenEncoding
}
set(newValue) {
token2id[index] = newValue
......@@ -71,7 +71,7 @@ class KeyedVectorDictionary: SNLPDictionary {
// These use memory mapping to load the values in more quickly
// TODO: Validate that this actually works on other systems... could easily be some issues
static func writeDictionaryToFile(url: URL, dictionary: [Key : Vector]) {
static func writeDictionaryToFile(url: URL, dictionary: [Key : Value]) {
let fileManager = FileManager.default
if !fileManager.fileExists(atPath: url.path) {
......@@ -106,10 +106,10 @@ class KeyedVectorDictionary: SNLPDictionary {
// These use memory mapping to load the values in more quickly
// TODO: Validate that this actually works on other systems... could easily be some issues
static func readDictionaryFromFile(_ url: URL) -> [Key : Vector] {
static func readDictionaryFromFile(_ url: URL) -> [Key : Value] {
//let fileURL = URL(fileURLWithPath: filename)
var result: [Key : Vector]
var result: [Key : Value]
do {
let data = try Data(contentsOf: url, options: .alwaysMapped)
......@@ -120,7 +120,7 @@ class KeyedVectorDictionary: SNLPDictionary {
var index = MemoryLayout<Int>.size
// Initialize the dictionary with the count
result = [Key : Vector](minimumCapacity: count)
result = [Key : Value](minimumCapacity: count)
debugPrint("Loading Dictionary with \(count) items from file.")
while index < data.count {
......@@ -151,18 +151,18 @@ class KeyedVectorDictionary: SNLPDictionary {
print("Error reading dictionary from file: \(error)")
}
return [Key : Vector]()
return [Key : Value]()
}
// This is the slow way, don't use this for interactive code ...
// Better to use memory mapped loaders above
static func readDictionaryFromTextFile(from url: URL) -> [Key: Vector]? {
static func readDictionaryFromTextFile(from url: URL) -> [Key: Value]? {
do {
let content = try String(contentsOf: url, encoding: .utf8)
let lines = content.split(separator: "\n")
var data: [Key: Vector] = [:]
var data: [Key: Value] = [:]
for line in lines.dropFirst() {
let tokens = line.split(separator: " ")
......
......@@ -12,11 +12,12 @@ import Foundation
@available(macOS 10.15.0, *)
class TruncatedKeyedVectorCorpus : SNLPCorpus {
typealias DocumentEncoding = [Double]
var encodedDocuments: [Int : DocumentEncoding] = [:]
var defaultDocumentEncoding: [Double]
private let _corpus: KeyedVectorCorpus // reference to the corpus we want to copy
......@@ -34,6 +35,8 @@ class TruncatedKeyedVectorCorpus : SNLPCorpus {
for document in _corpus.encodedDocuments {
encodedDocuments[document.key] = Array(document.value.prefix(upTo: d))
}
defaultDocumentEncoding = Array(repeating: Double(0), count: d)
}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment