diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift index da751bce2b98e4f6416515fed7331fad85c875d8..86c4ae46fe3b5bf8af549a23fb9d2044ccc1c9b4 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/DurableHNSWCorpusTests.swift @@ -44,7 +44,7 @@ final class DurableHNSWCorpusTests: XCTestCase { // workingDirectoryPath = FilePath(resourcesDirectoryURL.path) } - func testBasicExample() throws { + func testBuildBasicCorpus() throws { let docs = [ "CNTK formerly known as Computational Network Toolkit", "is a free easy-to-use open-source commercial-grade toolkit", @@ -89,7 +89,7 @@ final class DurableHNSWCorpusTests: XCTestCase { /// This is because size is only incremented when insertion is called but it is not called when read from disk! } - func testBasicQueryExample() async throws { + func testQueryBasicCorpus() async throws { let docs = [ "The quick brown fox jumps over the lazy dog", "I enjoy taking long walks along the beach at sunset", @@ -155,7 +155,7 @@ final class DurableHNSWCorpusTests: XCTestCase { try transaction.commit() } - func testBuildSubredditCorpus() async throws { + func testBuildGuelphSubredditCorpus() async throws { /// Generates the LMDB durable storage to disk but runs no tests otherwise /// Setting up the environment @@ -198,7 +198,7 @@ final class DurableHNSWCorpusTests: XCTestCase { try transaction.commit() } - func testSubredditQueryExample() async throws { + func testQueryGuelphSubredditCorpus() async throws { let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d) /// Setting up the environment diff --git a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift b/Tests/SwiftNLPTests/1. 
Data Collection/HNSW/EphemeralHNSWCorpusTests.swift index 13618f4e9e7e1a03fdc8454b26b46fe06823ee23..0360e6b29c06a8cf4df0c236525e2ec69da3c6dd 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/HNSW/EphemeralHNSWCorpusTests.swift @@ -8,7 +8,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { // MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus // Load a small set of documents and confirm that corpus and dictionary are updated accordingly - func testBasicExample() throws { + func testBuildBasicCorpus() throws { let docs = [ "CNTK formerly known as Computational Network Toolkit", "is a free easy-to-use open-source commercial-grade toolkit", @@ -27,7 +27,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { } // Load a bigger set of documents and confirm - func testLargeExample() throws { + func testBuildLargeCorpus() throws { let twentyQuotes = [ "Imagination is more important than knowledge. - Albert Einstein", "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. 
- Stephen Hawking", @@ -62,7 +62,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { } } - func testSubreddit() async throws { + func testBuildGuelphSubredditCorpus() async throws { guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { fatalError("Failed to find waterloo_submissions.zst in test bundle.") } @@ -84,7 +84,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { } // Load a small set of documents and confirm that corpus and dictionary are updated accordingly - func testBasicQueryExample() async throws { + func testQueryBasicCorpus() async throws { let docs = [ "The quick brown fox jumps over the lazy dog", "I enjoy taking long walks along the beach at sunset", @@ -114,7 +114,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { } } - func testLargeQueryExample() async throws { + func testQueryLargeCorpus() async throws { let docs = [ "Imagination is more important than knowledge. - Albert Einstein", "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking", @@ -156,7 +156,7 @@ final class EphemeralHNSWCorpusTests: XCTestCase { } } - func testSubredditQueryExample() async throws { + func testQueryGuelphSubredditCorpus() async throws { guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { fatalError("Failed to find waterloo_submissions.zst in test bundle.") } @@ -188,6 +188,35 @@ final class EphemeralHNSWCorpusTests: XCTestCase { print("Error when trying corpus.encodedDocuments.find(): \(error)") } } + + func testTypicalNeighborhoodSize() async throws { +// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { +// fatalError("Failed to find waterloo_submissions.zst in test bundle.") +// } +// guard let submissionsData = try? 
Data(contentsOf: submissionsURL) else { +// fatalError("Failed to load waterloo_submissions.zst from test bundle.") +// } +// +// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData) +// +// let typicalNeighborhoodSizes = [2, 8, 16, 32, 64, 128, 512, 1028] +// +// for typicalNeighborhoodSize in typicalNeighborhoodSizes { +// let startTime = Date() +// var corpus = HNSWCorpus(encoding: .glove6B50d, typicalNeighborhoodSize: typicalNeighborhoodSize) +// +// for submission in submissions { +// if let text = submission.selftext { +// corpus.addUntokenizedDocument(text) +// } +// } +// +// XCTAssert(corpus.count == 17999) +// +// let endTime = Date() +// print("Typical neighborhood size \(typicalNeighborhoodSize) took \(endTime.timeIntervalSince(startTime)) seconds.") +// } + } } #endif