Commit 0c406b9f authored by Jim Wallace

Tidy up test cases

parent 3547ef46
Pipeline #108191 passed
@@ -6,7 +6,7 @@ import PackageDescription
let package = Package(
name: "SwiftNLP",
platforms: [
.macOS(.v12),
.macOS(.v13),
],
products: [
.library(
@@ -24,6 +24,7 @@ let package = Package(
.package(url: "https://github.com/dclelland/Plinth", from: "2.0.0"),
.package(url: "https://github.com/ryan-lam/nifty", branch: "master"),
.package(url: "https://github.com/nifty-swift/Nifty-libs.git", from: "1.0.0"),
//.package(url: "https://github.com/ordo-one/package-benchmark", .upToNextMajor(from: "1.0.0")),
//.package(url: "https://github.com/jjjkkkjjj/Matft", from: "0.3.3"),
],
targets: [
......
@@ -8,7 +8,7 @@
import Foundation
import SWCompression
@inlinable
func downloadData(from url: URL) async throws -> Data {
let (data, _) = try await URLSession.shared.data(from: url)
return data
@@ -35,7 +35,6 @@ func download20Newsgroups() async -> [String] {
}
let tarData = try? await result.value
//return tarData!
var newsgroupData: [String] = [String]()
newsgroupData.reserveCapacity(tarData!.count)
......
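
The downloadData(from:) helper above is a thin wrapper over URLSession's async API. A minimal sketch of calling it without the force-unwrap that download20Newsgroups currently relies on; the URL string below is illustrative only, not the real corpus location:

import Foundation

// Sketch: exercising downloadData(from:) as defined above.
// The URL here is a placeholder.
func fetchArchive() async {
    guard let url = URL(string: "https://example.com/archive.tar.gz") else { return }
    do {
        let data = try await downloadData(from: url)
        print("Downloaded \(data.count) bytes")
    } catch {
        print("Download failed: \(error)")
    }
}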
@@ -11,11 +11,6 @@ let basicStopwordSet: Set<String> = [
"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming", "hereby", "amongst", "else", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
]
//public typealias Topic = [(word: Word, probability: Double)]
//public typealias TopicDistribution = [Topic]
class BoWDictionary: SNLPDictionary {
typealias Key = String
......
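
For context on how basicStopwordSet is applied, a minimal filtering sketch; the whitespace split below is an assumption, standing in for whatever tokenization the corpus actually performs:

// Sketch: dropping stopwords from a token stream before bag-of-words encoding.
let tokens = "the quick brown fox jumps over the lazy dog"
    .lowercased()
    .split(separator: " ")
    .map(String.init)
let contentWords = tokens.filter { !basicStopwordSet.contains($0) }
// contentWords == ["quick", "brown", "fox", "jumps", "lazy", "dog"]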
@@ -8,6 +8,8 @@
import Foundation
#if canImport(Surge)
import Surge
#else
//TODO: Implement a Linux alternative with better performance; currently uses a naive solution.
#endif
class KeyedVectorCorpus: SNLPCorpus {
......
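
The #if canImport(Surge) guard above means non-Apple builds currently fall through to the TODO branch. A sketch of the kind of naive fallback that comment implies; dotProduct is a hypothetical helper name, not part of KeyedVectorCorpus, and the accelerated branch assumes Surge's dot(_:_:) function:

// Sketch: conditional acceleration with a naive fallback for Linux.
#if canImport(Surge)
import Surge
func dotProduct(_ a: [Double], _ b: [Double]) -> Double {
    Surge.dot(a, b)
}
#else
func dotProduct(_ a: [Double], _ b: [Double]) -> Double {
    zip(a, b).reduce(0.0) { $0 + $1.0 * $1.1 }
}
#endif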
@@ -102,7 +102,7 @@ class KeyedVectorDictionary: SNLPDictionary {
}
fileHandle.closeFile()
print("Dictionary successfully written to file.")
//print("Dictionary successfully written to file.")
} catch {
print("Error writing dictionary to file: \(error)")
}
@@ -125,7 +125,7 @@ class KeyedVectorDictionary: SNLPDictionary {
// Initialize the dictionary with the count
result = [Key : Value](minimumCapacity: count)
debugPrint("Loading Dictionary with \(count) items from file.")
//debugPrint("Loading Dictionary with \(count) items from file.")
while index < data.count {
// Read the key
@@ -149,7 +149,7 @@ class KeyedVectorDictionary: SNLPDictionary {
}
}
print("Successfully read dictionary from file.")
//print("Successfully read dictionary from file.")
return result
} catch {
print("Error reading dictionary from file: \(error)")
......
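
The load loop above walks a raw Data buffer record by record; the exact byte layout isn't visible in this hunk, but the shape is length-prefixed keys and values. A generic sketch of that pattern under that assumption (the names here are illustrative, not this file's API):

import Foundation

// Sketch: length-prefixed string records, the general pattern the
// dictionary file format appears to follow.
func appendRecord(_ string: String, to data: inout Data) {
    let bytes = Data(string.utf8)
    let length = UInt32(bytes.count)
    withUnsafeBytes(of: length) { data.append(contentsOf: $0) }
    data.append(bytes)
}

func readRecord(from data: Data, at index: inout Int) -> String? {
    guard index + 4 <= data.count else { return nil }
    let length = data.subdata(in: index ..< index + 4)
        .withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) }
    index += 4
    guard index + Int(length) <= data.count else { return nil }
    defer { index += Int(length) }
    return String(data: data.subdata(in: index ..< index + Int(length)), encoding: .utf8)
}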
@@ -25,15 +25,13 @@ class SIMDCorpus<DocumentEncoding: SIMD>: SNLPCorpus where DocumentEncoding.Scal
for document in corpus.encodedDocuments {
encodedDocuments[document.key] = DocumentEncoding(document.value.prefix(DocumentEncoding.scalarCount))
}
}
// TODO: Required by the protocol, but it's not clear what implementation makes sense here ... maybe we implement our own?
func addDocument(document: [String]) {
// Do nothing ... ???
// Is this awkwardness enough of a reason to create a SNLPReducedCorpus protocol?
print("TruncatedKeyedVectorCoprus: addDocument(\(document)")
//print("TruncatedKeyedVectorCoprus: addDocument(\(document)")
//encodedDocuments[ encodedDocuments.count ] = DocumentEncoding( _dictionary. )
}
......
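
The initializer above reduces each document vector by taking its first scalarCount components. A standalone sketch of that prefix-into-SIMD step, with made-up embedding values:

// Sketch: truncating a full embedding into a fixed-width SIMD vector,
// mirroring the SIMDCorpus initializer above. Values are illustrative.
let embedding: [Double] = [0.12, -0.48, 0.07, 0.91, -0.33]
let reduced = SIMD2<Double>(embedding.prefix(SIMD2<Double>.scalarCount))
// reduced == SIMD2<Double>(0.12, -0.48)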
@@ -53,9 +53,26 @@ final class SwiftNLPBoWDictionaryTests: XCTestCase {
let dictionary: BoWDictionary = corpus.dictionary
XCTAssertEqual(dictionary.numDocs, 20)
//XCTAssertEqual(dictionary.numPos, 188)
//XCTAssertEqual(dictionary.numNNZ, 178)
}
func testDocumentReading() throws {
// Load all JSON data for the test documents
let redditCommentJson = TestUtils.loadAllRedditComment()
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0) }
let redditSubmissions = redditSubmissionJson.compactMap { readRedditSubmissionJson(json: $0) }
// Extract body and selftext from each post, and store that for our corpus
let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }
// Add documents to corpus
var corpus = BoWCorpus()
corpus.addDocuments(documents: bodies)
//print(corpus.encodedDocuments.count)
XCTAssert(corpus.encodedDocuments.count == 28837) //TODO: Confirm this number
}
}
@@ -12,17 +12,10 @@ final class SwiftNLPKeyedvectorDictionaryTests: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
let startTime = DispatchTime.now()
var corpus = KeyedVectorCorpus(source: .glove6B50d)
let stopTime = DispatchTime.now()
let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
print("Initialized corpus in \(totalTime) seconds")
corpus.addDocuments(documents: docs)
print(corpus.encodedDocuments)
let dictionary = corpus.dictionary
print(dictionary["the"])
XCTAssert(corpus.encodedDocuments.count == 3)
}
}
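
The timing code in this hunk follows the usual DispatchTime pattern. A reusable sketch of the same measurement, should it be wanted again; measureSeconds is an illustrative name, not part of the test suite:

import Dispatch

// Sketch: the elapsed-time measurement used around KeyedVectorCorpus
// initialization, factored into a helper.
func measureSeconds<T>(_ label: String, _ body: () throws -> T) rethrows -> T {
    let start = DispatchTime.now()
    let result = try body()
    let seconds = Double(DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds) / 1_000_000_000
    print("\(label) took \(seconds) seconds")
    return result
}

// Usage, mirroring the test above:
// let corpus = measureSeconds("Corpus init") { KeyedVectorCorpus(source: .glove6B50d) }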
@@ -8,15 +8,7 @@ import XCTest
@testable import SwiftNLP
final class SwiftNLPLoadDataTests: XCTestCase {
// test fetching names of all the files
func testFileNameFetching() throws {
let redditCommentNames = TestUtils.getJsonFiles(prefix: "RC")
print("reddit comment files: \(redditCommentNames)")
let redditSubmissionNames = TestUtils.getJsonFiles(prefix: "RS")
print("reddit submission files: \(redditSubmissionNames)")
}
// test reading reddit submission json files into actual objects
func testRedditSubmissions() throws {
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
@@ -35,17 +27,16 @@ final class SwiftNLPLoadDataTests: XCTestCase {
}
}
func test20kDownload() async throws {
let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
print("Loaded \(result.count) threads from server.")
if let random = result.randomElement() {
let (key, value) = random
print("Key: \(key), Value: \(value)")
}
XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
}
// func test20kDownload() async throws {
//
// let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
// print("Loaded \(result.count) threads from server.")
// if let random = result.randomElement() {
// let (key, value) = random
// print("Key: \(key), Value: \(value)")
// }
// XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
// }
func testDocumentReading() throws {
@@ -53,61 +44,28 @@ final class SwiftNLPLoadDataTests: XCTestCase {
let redditCommentJson = TestUtils.loadAllRedditComment()
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0)}
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0) }
let redditSubmissions = redditSubmissionJson.compactMap { readRedditSubmissionJson(json: $0) }
var bodies: [String] = []
// Append the body of every Reddit comment to the document list
for comment in redditComments {
//debugPrint("Processing \(comment.posts.count) comments")
for post in comment.posts {
if let body = post.body {
bodies.append(body)
}
}
}
for submission in redditSubmissions {
//debugPrint("Processing \(submission.posts.count) submissions")
for post in submission.posts {
if let p = post.selftext {
//debugPrint(p)
bodies.append(p)
}
}
}
// Debug code
//bodies = Array(bodies.prefix(10))
// Extract body and selftext from each post, and store that for our corpus
let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }
// Add documents to corpus
var corpus = KeyedVectorCorpus(source: .glove6B50d)
corpus.addDocuments(documents: bodies)
// for comment in corpus.encodedDocuments {
// debugPrint(comment.value)
// }
//debugPrint("Encoded \(corpus.encodedDocuments.count) comments and submissions")
//debugPrint("Original: \(corpus[0])")
//print(corpus.encodedDocuments.count)
XCTAssert(corpus.encodedDocuments.count == 28765)
// Dimensionality reduction
let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus)
debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")
//debugPrint("Truncated: \(truncatedCorpus.encodedDocuments[0]!)")
// for comment in truncatedCorpus.encodedDocuments {
// debugPrint(comment.value)
// }
//truncatedCorpus.clusterWithDBSCAN(epsilon: 0.1, minimumNumberofPointsInDenseRegion: 20)
//truncatedCorpus.clusterWithKMeans()
//debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")
// Clustering / Topic Detection
var topics = StubTopicModel<[Double]>(numberOfTopics: 3)
topics.train(truncatedCorpus)
debugPrint(topics)
//debugPrint(topics)
}
}
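
The tidy-up above replaces nested for-loops with a flatMap/compactMap chain; the two forms produce the same bodies array. A self-contained sketch of that equivalence, using simplified stand-in types rather than the real Reddit models:

// Sketch: loop form vs. flatMap/compactMap form, as in the refactor above.
// Post and Thread are illustrative stand-ins.
struct Post { let body: String? }
struct Thread { let posts: [Post] }

let threads = [Thread(posts: [Post(body: "a"), Post(body: nil)]),
               Thread(posts: [Post(body: "b")])]

// Old shape: explicit loops with optional unwrapping.
var loopBodies: [String] = []
for thread in threads {
    for post in thread.posts {
        if let body = post.body {
            loopBodies.append(body)
        }
    }
}

// New shape: one expression.
let mappedBodies = threads.flatMap { $0.posts.compactMap { $0.body } }
assert(loopBodies == mappedBodies) // ["a", "b"]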
//import XCTest
//@testable import SwiftNLP
//
//final class SwiftNLPWebCorpusTests: XCTestCase {
//
// func test20Newsgroups() throws {
//
//
// let urlString = "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz"
// let wc = SNLPWebCorpus()
//
// print("START")
//
// if let url = URL(string: urlString) {
// wc.downloadAndUncompressTarGzFile(url: url) { result in
// switch result {
// case .success(let extractedFiles):
// for (filename, fileData) in extractedFiles {
// print("Filename: \(filename), Data size: \(fileData.count) bytes")
// }
// case .failure(let error):
// print("Error uncompressing .tar.gz file: \(error.localizedDescription)")
// }
// }
// } else {
// print("Invalid URL")
// }
//
// print("END")
// }
//
//}
@@ -23,11 +23,11 @@ final class SwiftAnnoyTest: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
let startTime = DispatchTime.now()
//let startTime = DispatchTime.now()
var corpus = KeyedVectorCorpus(source: .glove6B50d)
let stopTime = DispatchTime.now()
let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
print("Initialized corpus in \(totalTime) seconds")
//let stopTime = DispatchTime.now()
//let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
//print("Initialized corpus in \(totalTime) seconds")
corpus.addDocuments(documents: docs)
print(corpus.encodedDocuments)
@@ -46,8 +46,8 @@ final class SwiftAnnoyTest: XCTestCase {
let (ids, distances) = myIndex.getNNsForVector(vector: &frog, neighbors: 10)!
for (id, distance) in zip(ids, distances) {
debugPrint("\(myMap[id]!) was \(distance)")
}
// for (id, distance) in zip(ids, distances) {
// debugPrint("\(myMap[id]!) was \(distance)")
// }
}
}
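
The query above force-unwraps both the result of getNNsForVector and each map lookup. A sketch of the same lookup written defensively, using only the fixtures already in the test (myIndex, myMap, frog) and assuming myMap is a dictionary keyed by id:

// Sketch: the nearest-neighbor query above, without force-unwraps.
if let (ids, distances) = myIndex.getNNsForVector(vector: &frog, neighbors: 10) {
    for (id, distance) in zip(ids, distances) {
        debugPrint("\(myMap[id] ?? "<unknown>") was \(distance)")
    }
}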