Commit 0c406b9f authored by Jim Wallace

Tidy up test cases

parent 3547ef46
Pipeline #108191 passed
@@ -6,7 +6,7 @@ import PackageDescription
let package = Package(
name: "SwiftNLP",
platforms: [
.macOS(.v12),
.macOS(.v13),
],
products: [
.library(
@@ -24,6 +24,7 @@ let package = Package(
.package(url: "https://github.com/dclelland/Plinth", from: "2.0.0"),
.package(url: "https://github.com/ryan-lam/nifty", branch: "master"),
.package(url: "https://github.com/nifty-swift/Nifty-libs.git", from: "1.0.0"),
//.package(url: "https://github.com/ordo-one/package-benchmark", .upToNextMajor(from: "1.0.0")),
//.package(url: "https://github.com/jjjkkkjjj/Matft", from: "0.3.3"),
],
targets: [
......
@@ -8,7 +8,7 @@
import Foundation
import SWCompression
@inlinable
func downloadData(from url: URL) async throws -> Data {
let (data, _) = try await URLSession.shared.data(from: url)
return data
@@ -35,7 +35,6 @@ func download20Newsgroups() async -> [String] {
}
let tarData = try? await result.value
//return tarData!
var newsgroupData: [String] = [String]()
newsgroupData.reserveCapacity(tarData!.count)
......
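
The downloadData(from:) helper above is a thin wrapper over URLSession's async API. A minimal sketch of calling it without the force-unwrap that download20Newsgroups currently relies on; the URL string below is illustrative only, not the real corpus location:

import Foundation

// Sketch: exercising downloadData(from:) as defined above.
// The URL here is a placeholder.
func fetchArchive() async {
    guard let url = URL(string: "https://example.com/archive.tar.gz") else { return }
    do {
        let data = try await downloadData(from: url)
        print("Downloaded \(data.count) bytes")
    } catch {
        print("Download failed: \(error)")
    }
}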
@@ -11,11 +11,6 @@ let basicStopwordSet: Set<String> = [
"all", "six", "just", "less", "being", "indeed", "over", "move", "anyway", "four", "not", "own", "through", "using", "fifty", "where", "mill", "only", "find", "before", "one", "whose", "system", "how", "somewhere", "much", "thick", "show", "had", "enough", "should", "to", "must", "whom", "seeming", "yourselves", "under", "ours", "two", "has", "might", "thereafter", "latterly", "do", "them", "his", "around", "than", "get", "very", "de", "none", "cannot", "every", "un", "they", "front", "during", "thus", "now", "him", "nor", "name", "regarding", "several", "hereafter", "did", "always", "who", "didn", "whither", "this", "someone", "either", "each", "become", "thereupon", "sometime", "side", "towards", "therein", "twelve", "because", "often", "ten", "our", "doing", "km", "eg", "some", "back", "used", "up", "go", "namely", "computer", "are", "further", "beyond", "ourselves", "yet", "out", "even", "will", "what", "still", "for", "bottom", "mine", "since", "please", "forty", "per", "its", "everything", "behind", "does", "various", "above", "between", "it", "neither", "seemed", "ever", "across", "she", "somehow", "be", "we", "full", "never", "sixty", "however", "here", "otherwise", "were", "whereupon", "nowhere", "although", "found", "alone", "re", "along", "quite", "fifteen", "by", "both", "about", "last", "would", "anything", "via", "many", "could", "thence", "put", "against", "keep", "etc", "amount", "became", "ltd", "hence", "onto", "or", "con", "among", "already", "co", "afterwards", "formerly", "within", "seems", "into", "others", "while", "whatever", "except", "down", "hers", "everyone", "done", "least", "another", "whoever", "moreover", "couldnt", "throughout", "anyhow", "yourself", "three", "from", "her", "few", "together", "top", "there", "due", "been", "next", "anyone", "eleven", "cry", "call", "therefore", "interest", "then", "thru", "themselves", "hundred", "really", "sincere", "empty", "more", "himself", "elsewhere", "mostly", "on", "fire", "am", "becoming", "hereby", "amongst", "else", "amongst", "else", "part", "everywhere", "too", "kg", "herself", "former", "those", "he", "me", "myself", "made", "twenty", "these", "was", "bill", "cant", "us", "until", "besides", "nevertheless", "below", "anywhere", "nine", "can", "whether", "of", "your", "toward", "my", "say", "something", "and", "whereafter", "whenever", "give", "almost", "wherever", "is", "describe", "beforehand", "herein", "doesn", "an", "as", "itself", "at", "have", "in", "seem", "whence", "ie", "any", "fill", "again", "hasnt", "inc", "thereby", "thin", "no", "perhaps", "latter", "meanwhile", "when", "detail", "same", "wherein", "beside", "also", "that", "other", "take", "which", "becomes", "you", "if", "nobody", "unless", "whereas", "see", "though", "may", "after", "upon", "most", "hereupon", "eight", "but", "serious", "nothing", "such", "why", "off", "a", "don", "whereby", "third", "i", "whole", "noone", "sometimes", "well", "amoungst", "yours", "their", "rather", "without", "so", "five", "the", "first", "with", "make", "once"
]
//public typealias Topic = [(word: Word, probability: Double)]
//public typealias TopicDistribution = [Topic]
class BoWDictionary: SNLPDictionary {
typealias Key = String
......
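
For context on how basicStopwordSet is applied, a minimal filtering sketch; the whitespace split below is an assumption, standing in for whatever tokenization the corpus actually performs:

// Sketch: dropping stopwords from a token stream before bag-of-words encoding.
let tokens = "the quick brown fox jumps over the lazy dog"
    .lowercased()
    .split(separator: " ")
    .map(String.init)
let contentWords = tokens.filter { !basicStopwordSet.contains($0) }
// contentWords == ["quick", "brown", "fox", "jumps", "lazy", "dog"]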
@@ -8,6 +8,8 @@
import Foundation
#if canImport(Surge)
import Surge
#else
//TODO: Implement a Linux alternative with better performance; currently uses a naive solution.
#endif
class KeyedVectorCorpus: SNLPCorpus {
......
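
The #if canImport(Surge) guard above means non-Apple builds currently fall through to the TODO branch. A sketch of the kind of naive fallback that comment implies; dotProduct is a hypothetical helper name, not part of KeyedVectorCorpus, and the accelerated branch assumes Surge's dot(_:_:) function:

// Sketch: conditional acceleration with a naive fallback for Linux.
#if canImport(Surge)
import Surge
func dotProduct(_ a: [Double], _ b: [Double]) -> Double {
    Surge.dot(a, b)
}
#else
func dotProduct(_ a: [Double], _ b: [Double]) -> Double {
    zip(a, b).reduce(0.0) { $0 + $1.0 * $1.1 }
}
#endif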
@@ -102,7 +102,7 @@ class KeyedVectorDictionary: SNLPDictionary {
}
fileHandle.closeFile()
print("Dictionary successfully written to file.")
//print("Dictionary successfully written to file.")
} catch {
print("Error writing dictionary to file: \(error)")
}
@@ -125,7 +125,7 @@ class KeyedVectorDictionary: SNLPDictionary {
// Initialize the dictionary with the count
result = [Key : Value](minimumCapacity: count)
debugPrint("Loading Dictionary with \(count) items from file.")
//debugPrint("Loading Dictionary with \(count) items from file.")
while index < data.count {
// Read the key
@@ -149,7 +149,7 @@ class KeyedVectorDictionary: SNLPDictionary {
}
}
print("Successfully read dictionary from file.")
//print("Successfully read dictionary from file.")
return result
} catch {
print("Error reading dictionary from file: \(error)")
......
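
The load loop above walks a raw Data buffer record by record; the exact byte layout isn't visible in this hunk, but the shape is length-prefixed keys and values. A generic sketch of that pattern under that assumption (the names here are illustrative, not this file's API):

import Foundation

// Sketch: length-prefixed string records, the general pattern the
// dictionary file format appears to follow.
func appendRecord(_ string: String, to data: inout Data) {
    let bytes = Data(string.utf8)
    let length = UInt32(bytes.count)
    withUnsafeBytes(of: length) { data.append(contentsOf: $0) }
    data.append(bytes)
}

func readRecord(from data: Data, at index: inout Int) -> String? {
    guard index + 4 <= data.count else { return nil }
    let length = data.subdata(in: index ..< index + 4)
        .withUnsafeBytes { $0.loadUnaligned(as: UInt32.self) }
    index += 4
    guard index + Int(length) <= data.count else { return nil }
    defer { index += Int(length) }
    return String(data: data.subdata(in: index ..< index + Int(length)), encoding: .utf8)
}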
@@ -25,15 +25,13 @@ class SIMDCorpus<DocumentEncoding: SIMD>: SNLPCorpus where DocumentEncoding.Scal
for document in corpus.encodedDocuments {
encodedDocuments[document.key] = DocumentEncoding(document.value.prefix(DocumentEncoding.scalarCount))
}
}
// TODO: Required by the protocol, but it's not clear what implementation makes sense here ... maybe we implement our own?
func addDocument(document: [String]) {
// Do nothing ... ???
// Is this awkwardness enough of a reason to create a SNLPReducedCorpus protocol?
print("TruncatedKeyedVectorCoprus: addDocument(\(document)")
//print("TruncatedKeyedVectorCoprus: addDocument(\(document)")
//encodedDocuments[ encodedDocuments.count ] = DocumentEncoding( _dictionary. )
}
......
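
The initializer above reduces each document vector by taking its first scalarCount components. A standalone sketch of that prefix-into-SIMD step, with made-up embedding values:

// Sketch: truncating a full embedding into a fixed-width SIMD vector,
// mirroring the SIMDCorpus initializer above. Values are illustrative.
let embedding: [Double] = [0.12, -0.48, 0.07, 0.91, -0.33]
let reduced = SIMD2<Double>(embedding.prefix(SIMD2<Double>.scalarCount))
// reduced == SIMD2<Double>(0.12, -0.48)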
@@ -53,9 +53,26 @@ final class SwiftNLPBoWDictionaryTests: XCTestCase {
let dictionary: BoWDictionary = corpus.dictionary
XCTAssertEqual(dictionary.numDocs, 20)
//XCTAssertEqual(dictionary.numPos, 188)
//XCTAssertEqual(dictionary.numNNZ, 178)
}
func testDocumentReading() throws {
// Load all JSON data for the test documents
let redditCommentJson = TestUtils.loadAllRedditComment()
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0) }
let redditSubmissions = redditSubmissionJson.compactMap { readRedditSubmissionJson(json: $0) }
// Extract body and selftext from each post, and store that for our corpus
let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }
// Add documents to corpus
var corpus = BoWCorpus()
corpus.addDocuments(documents: bodies)
//print(corpus.encodedDocuments.count)
XCTAssert(corpus.encodedDocuments.count == 28837) //TODO: Confirm this number
}
}
@@ -12,17 +12,10 @@ final class SwiftNLPKeyedvectorDictionaryTests: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
let startTime = DispatchTime.now()
var corpus = KeyedVectorCorpus(source: .glove6B50d)
let stopTime = DispatchTime.now()
let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
print("Initialized corpus in \(totalTime) seconds")
corpus.addDocuments(documents: docs)
print(corpus.encodedDocuments)
let dictionary = corpus.dictionary
print(dictionary["the"])
XCTAssert(corpus.encodedDocuments.count == 3)
}
}
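
The timing code in this hunk follows the usual DispatchTime pattern. A reusable sketch of the same measurement, should it be wanted again; measureSeconds is an illustrative name, not part of the test suite:

import Dispatch

// Sketch: the elapsed-time measurement used around KeyedVectorCorpus
// initialization, factored into a helper.
func measureSeconds<T>(_ label: String, _ body: () throws -> T) rethrows -> T {
    let start = DispatchTime.now()
    let result = try body()
    let seconds = Double(DispatchTime.now().uptimeNanoseconds - start.uptimeNanoseconds) / 1_000_000_000
    print("\(label) took \(seconds) seconds")
    return result
}

// Usage, mirroring the test above:
// let corpus = measureSeconds("Corpus init") { KeyedVectorCorpus(source: .glove6B50d) }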
@@ -8,15 +8,7 @@ import XCTest
@testable import SwiftNLP
final class SwiftNLPLoadDataTests: XCTestCase {
// test fetching names of all the files
func testFileNameFetching() throws {
let redditCommentNames = TestUtils.getJsonFiles(prefix: "RC")
print("reddit comment files: \(redditCommentNames)")
let redditSubmissionNames = TestUtils.getJsonFiles(prefix: "RS")
print("reddit submission files: \(redditSubmissionNames)")
}
// test reading reddit submission json files into actual objects
func testRedditSubmissions() throws {
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
@@ -35,17 +27,16 @@ final class SwiftNLPLoadDataTests: XCTestCase {
}
}
func test20kDownload() async throws {
let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
print("Loaded \(result.count) threads from server.")
if let random = result.randomElement() {
let (key, value) = random
print("Key: \(key), Value: \(value)")
}
XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
}
// func test20kDownload() async throws {
//
// let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
// print("Loaded \(result.count) threads from server.")
// if let random = result.randomElement() {
// let (key, value) = random
// print("Key: \(key), Value: \(value)")
// }
// XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
// }
func testDocumentReading() throws {
@@ -53,61 +44,28 @@ final class SwiftNLPLoadDataTests: XCTestCase {
let redditCommentJson = TestUtils.loadAllRedditComment()
let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0)}
let redditComments = redditCommentJson.compactMap { readRedditCommentJson(json: $0) }
let redditSubmissions = redditSubmissionJson.compactMap { readRedditSubmissionJson(json: $0) }
var bodies: [String] = []
// Append the body of every Reddit comment to the document list
for comment in redditComments {
//debugPrint("Processing \(comment.posts.count) comments")
for post in comment.posts {
if let body = post.body {
bodies.append(body)
}
}
}
for submission in redditSubmissions {
//debugPrint("Processing \(submission.posts.count) submissions")
for post in submission.posts {
if let p = post.selftext {
//debugPrint(p)
bodies.append(p)
}
}
}
// Debug code
//bodies = Array(bodies.prefix(10))
// Extract body and selftext from each post, and store that for our corpus
let bodies = redditComments.flatMap { $0.posts.compactMap { $0.body } } +
redditSubmissions.flatMap { $0.posts.compactMap { $0.selftext } }
// Add documents to corpus
var corpus = KeyedVectorCorpus(source: .glove6B50d)
corpus.addDocuments(documents: bodies)
// for comment in corpus.encodedDocuments {
// debugPrint(comment.value)
// }
//debugPrint("Encoded \(corpus.encodedDocuments.count) comments and submissions")
//debugPrint("Original: \(corpus[0])")
//print(corpus.encodedDocuments.count)
XCTAssert(corpus.encodedDocuments.count == 28765)
// Dimensionality reduction
let truncatedCorpus = SIMDCorpus<SIMD2<Double>>(corpus)
debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")
//debugPrint("Truncated: \(truncatedCorpus.encodedDocuments[0]!)")
// for comment in truncatedCorpus.encodedDocuments {
// debugPrint(comment.value)
// }
//truncatedCorpus.clusterWithDBSCAN(epsilon: 0.1, minimumNumberofPointsInDenseRegion: 20)
//truncatedCorpus.clusterWithKMeans()
//debugPrint("TruncatedCorpus with \(truncatedCorpus.encodedDocuments.count) documents created")
// Clustering / Topic Detection
var topics = StubTopicModel<[Double]>(numberOfTopics: 3)
topics.train(truncatedCorpus)
debugPrint(topics)
//debugPrint(topics)
}
}
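
The tidy-up above replaces nested for-loops with a flatMap/compactMap chain; the two forms produce the same bodies array. A self-contained sketch of that equivalence, using simplified stand-in types rather than the real Reddit models:

// Sketch: loop form vs. flatMap/compactMap form, as in the refactor above.
// Post and Thread are illustrative stand-ins.
struct Post { let body: String? }
struct Thread { let posts: [Post] }

let threads = [Thread(posts: [Post(body: "a"), Post(body: nil)]),
               Thread(posts: [Post(body: "b")])]

// Old shape: explicit loops with optional unwrapping.
var loopBodies: [String] = []
for thread in threads {
    for post in thread.posts {
        if let body = post.body {
            loopBodies.append(body)
        }
    }
}

// New shape: one expression.
let mappedBodies = threads.flatMap { $0.posts.compactMap { $0.body } }
assert(loopBodies == mappedBodies) // ["a", "b"]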
//import XCTest
//@testable import SwiftNLP
//
//final class SwiftNLPWebCorpusTests: XCTestCase {
//
// func test20Newsgroups() throws {
//
//
// let urlString = "http://qwone.com/~jason/20Newsgroups/20news-18828.tar.gz"
// let wc = SNLPWebCorpus()
//
// print("START")
//
// if let url = URL(string: urlString) {
// wc.downloadAndUncompressTarGzFile(url: url) { result in
// switch result {
// case .success(let extractedFiles):
// for (filename, fileData) in extractedFiles {
// print("Filename: \(filename), Data size: \(fileData.count) bytes")
// }
// case .failure(let error):
// print("Error uncompressing .tar.gz file: \(error.localizedDescription)")
// }
// }
// } else {
// print("Invalid URL")
// }
//
// print("END")
// }
//
//}
@@ -23,11 +23,11 @@ final class SwiftAnnoyTest: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
let startTime = DispatchTime.now()
//let startTime = DispatchTime.now()
var corpus = KeyedVectorCorpus(source: .glove6B50d)
let stopTime = DispatchTime.now()
let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
print("Initialized corpus in \(totalTime) seconds")
//let stopTime = DispatchTime.now()
//let totalTime = Double(stopTime.uptimeNanoseconds - startTime.uptimeNanoseconds) / 1_000_000_000
//print("Initialized corpus in \(totalTime) seconds")
corpus.addDocuments(documents: docs)
print(corpus.encodedDocuments)
@@ -46,8 +46,8 @@ final class SwiftAnnoyTest: XCTestCase {
let (ids, distances) = myIndex.getNNsForVector(vector: &frog, neighbors: 10)!
for (id, distance) in zip(ids, distances) {
debugPrint("\(myMap[id]!) was \(distance)")
}
// for (id, distance) in zip(ids, distances) {
// debugPrint("\(myMap[id]!) was \(distance)")
// }
}
}
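
The query above force-unwraps both the result of getNNsForVector and each map lookup. A sketch of the same lookup written defensively, using only the fixtures already in the test (myIndex, myMap, frog) and assuming myMap is a dictionary keyed by id:

// Sketch: the nearest-neighbor query above, without force-unwraps.
if let (ids, distances) = myIndex.getNNsForVector(vector: &frog, neighbors: 10) {
    for (id, distance) in zip(ids, distances) {
        debugPrint("\(myMap[id] ?? "<unknown>") was \(distance)")
    }
}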