Skip to content
Snippets Groups Projects
Commit 7a816683 authored by Mingchung Xia's avatar Mingchung Xia
Browse files

Started HNSW testcases

parent 768b1fa4
No related branches found
No related tags found
1 merge request: !13 "HNSW Implementation with Testcases"
Pipeline #110532 failed
#if os(macOS)
import XCTest
import Foundation
@testable import SwiftNLP
/// Tests that exercise `HNSWCorpus` with a `ContextFreeEncoder` backed by the
/// `.glove6B50d` embedding source.
///
/// Each test adds documents to a fresh corpus and then checks how many encoded
/// vectors the corpus holds; the two in-memory tests additionally check that no
/// encoding equals `corpus.zeroes`.
final class HNSWTests: XCTestCase {

    /// Loads a small set of documents and confirms that the corpus holds one
    /// encoded vector per document, none of which equals `corpus.zeroes`.
    func testBasicExample() throws {
        let docs = [
            "CNTK formerly known as Computational Network Toolkit",
            "is a free easy-to-use open-source commercial-grade toolkit",
            "that enable us to train deep learning algorithms to learn like the human brain."
        ]

        let encoder = ContextFreeEncoder<Double>(source: .glove6B50d)
        var corpus = HNSWCorpus(_documentEncoder: encoder)
        corpus.addUntokenizedDocuments(docs)

        // XCTAssertEqual reports both values on failure, unlike XCTAssert(a == b).
        XCTAssertEqual(corpus.encodedDocuments.base.vectors.count, docs.count)

        // Make sure none of our encodings are zero
        for c in corpus {
            XCTAssertNotEqual(c, corpus.zeroes)
        }
    }

    /// Loads a bigger set of documents (twenty quotes) and confirms that the
    /// corpus holds one encoded vector per document, none of which equals
    /// `corpus.zeroes`.
    func testBiggerExample() throws {
        let twentyQuotes = [
            "Imagination is more important than knowledge. - Albert Einstein",
            "The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
            "If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
            "The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
            "Science is the belief in the ignorance of experts. - Richard Feynman",
            "The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
            "Science is the poetry of reality. - Richard Dawkins",
            "To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
            "The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
            "Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
            "An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
            "If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
            "The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
            "Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
            "In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
            "Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
            "Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
            "The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
            "One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
            "All science is either physics or stamp collecting. - Ernest Rutherford"
        ]

        let encoder = ContextFreeEncoder<Double>(source: .glove6B50d)
        var corpus = HNSWCorpus(_documentEncoder: encoder)
        corpus.addUntokenizedDocuments(twentyQuotes)

        XCTAssertEqual(corpus.encodedDocuments.base.vectors.count, twentyQuotes.count)

        // Make sure none of our encodings are zero
        for c in corpus {
            XCTAssertNotEqual(c, corpus.zeroes)
        }
    }

    /// Loads the bundled `Guelph_submissions.zst` archive, adds the `selftext`
    /// of every submission that has one to the corpus, and checks the resulting
    /// number of encoded vectors.
    ///
    /// - Throws: If the resource is missing from the test bundle, if its bytes
    ///   cannot be read, or if `loadFromRedditArchive` fails.
    func testSubreddit() async throws {
        // A missing resource fails only this test (via XCTUnwrap) instead of
        // aborting the whole test process the way fatalError would.
        let submissionsURL = try XCTUnwrap(
            Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst"),
            "Failed to find Guelph_submissions.zst in test bundle."
        )

        // Propagate the underlying read error rather than discarding it with `try?`.
        let submissionsData = try Data(contentsOf: submissionsURL)

        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)

        let encoder = ContextFreeEncoder<Double>(source: .glove6B50d)
        var corpus = HNSWCorpus(_documentEncoder: encoder)

        // Only submissions with a non-nil selftext are added to the corpus.
        for text in submissions.compactMap(\.selftext) {
            corpus.addUntokenizedDocument(text)
        }

        // NOTE(review): presumably the number of submissions in the bundled
        // archive that carry a selftext — confirm against the fixture.
        let expectedDocumentCount = 17999
        XCTAssertEqual(corpus.encodedDocuments.base.vectors.count, expectedDocumentCount)
    }
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment