Skip to content
Snippets Groups Projects

Compare revisions

Changes are shown as if the source revision was being merged into the target revision. Learn more about comparing revisions.

Source

Select target project
No results found

Target

Select target project
  • jrwallace/curio
1 result
Show changes
Showing
with 974 additions and 59 deletions
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-14.
//
import Foundation
import SimilarityMetric
#if canImport(Surge) && canImport(Accelerate) && os(macOS)
import Surge
import Accelerate
/// Cosine similarity backed by Surge's vectorized dot product (Accelerate-enabled macOS builds).
public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {

    /// Returns the cosine similarity of the two vectors, i.e. `dot(a, b) / (|a| * |b|)`.
    /// - Note: a zero-magnitude input yields NaN (division by zero) — unchanged from the
    ///   portable fallback implementation's behavior.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        // Surge operates on [Double], so widen the generic elements first.
        let lhs = someItem.map { Double($0) }
        let rhs = otherItem.map { Double($0) }

        // Numerator: vectorized dot product.
        let numerator = Surge.dot(lhs, rhs)

        // Denominator: Euclidean norms, each obtained from a self dot product.
        let lhsNorm = sqrt(Surge.dot(lhs, lhs))
        let rhsNorm = sqrt(Surge.dot(rhs, rhs))

        // Narrow the Double result back to the caller's element type.
        return Vector.Element(numerator / (lhsNorm * rhsNorm))
    }
}
#else
//import Nifty
// MARK: Nifty is too outdated to retrofit our code, even after updating its swift-tools-version to 5.9.
/// This implementation may be less efficient on Linux
/// Portable cosine similarity for platforms without Surge/Accelerate (e.g. Linux).
public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {

    /// Returns the cosine similarity of the two vectors, i.e. `dot(a, b) / (|a| * |b|)`.
    /// - Note: the dot product runs over `zip`, so it truncates to the shorter vector,
    ///   while each magnitude is computed over its full vector (as in the original).
    ///   A zero-magnitude input yields NaN.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        // Numerator: sum of pairwise products.
        var dot: Vector.Element = 0
        for (lhs, rhs) in zip(someItem, otherItem) {
            dot += lhs * rhs
        }

        // Denominator: Euclidean norm of each vector.
        var lhsSumOfSquares: Vector.Element = 0
        for value in someItem {
            lhsSumOfSquares += value * value
        }
        var rhsSumOfSquares: Vector.Element = 0
        for value in otherItem {
            rhsSumOfSquares += value * value
        }

        return dot / (lhsSumOfSquares.squareRoot() * rhsSumOfSquares.squareRoot())
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// See the https://github.com/JadenGeller/similarity-topology.git
// for reference. The code is used with permission from the author
// under the MIT License.
//
// Created by Mingchung Xia on 2024-01-28.
//
#if canImport(GameplayKit) && os(macOS)
import Foundation
import GameplayKit
// MARK: GameplayKit provides a mersenne twister for RNG, but is not available on Linux
// See https://github.com/quells/Squall package for alternative mersenne twister
/// A `RandomNumberGenerator` backed by GameplayKit's seeded Mersenne twister,
/// giving reproducible 64-bit streams for a given seed.
@available(macOS, introduced: 10.11)
struct MersenneTwisterRNG: RandomNumberGenerator {
    private let randomSource: GKMersenneTwisterRandomSource

    /// Creates a generator whose output stream is fully determined by `seed`.
    init(seed: UInt64) {
        randomSource = GKMersenneTwisterRandomSource(seed: seed)
    }

    /// Splices two successive 32-bit draws into one 64-bit value.
    mutating func next() -> UInt64 {
        // Int32(_:) traps if nextInt() ever falls outside Int32 range —
        // presumably the source only emits 32-bit values; confirm if porting.
        let high = UInt64(UInt32(bitPattern: Int32(randomSource.nextInt())))
        let low = UInt64(UInt32(bitPattern: Int32(randomSource.nextInt())))
        return (high << 32) | low
    }
}
#endif
......@@ -20,10 +20,22 @@
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation
/// A deterministic, seedable `RandomNumberGenerator` implemented as a 64-bit
/// linear congruential generator (LCG), usable on every platform (unlike the
/// GameplayKit-backed generator, which is Apple-only).
///
/// FIX: removed an unrelated commented-out `DictionaryCorpus` extension stub
/// that had been spliced into the middle of this struct (merge/diff artifact).
struct SeedableRNG: RandomNumberGenerator {
    /// Current LCG state; advanced on every call to `next()`.
    private var seed: UInt64

    init(seed: UInt64) {
        self.seed = seed
    }

    /// Advances the state as `state = m &* state &+ 1` (mod 2^64 via wrapping
    /// operators) and returns it. The multiplier is Knuth's MMIX LCG constant.
    /// NOTE(review): a plain LCG's low bits are statistically weak — fine for
    /// reproducible tests, not for quality randomness.
    mutating func next() -> UInt64 {
        let multiplier: UInt64 = 6364136223846793005
        seed = multiplier &* seed &+ 1
        return seed
    }
}
......@@ -23,26 +23,38 @@
import Foundation
class DictionaryCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
private var _documentEncoder: any SNLPEncoder
var zeroes: [Scalar] { _documentEncoder.zeroes as! [Scalar] }
struct InMemoryCorpus<Item: SNLPDataItem, Encoder: SNLPEncoder>: SNLPCorpus {
var encodedDocuments: [Int : [Scalar] ] = [:]
internal var documentEncoder: Encoder
internal var documents = ContiguousArray<Item>()
internal var encodedDocuments = ContiguousArray<[Encoder.Scalar]>()
var count: Int { encodedDocuments.count }
init(item: Item = String(), encoder: Encoder = Encoder()) {
documentEncoder = encoder
}
init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self) {
_documentEncoder = ContextFreeEncoder(source: encoding)
init(encoder: Encoder) {
documentEncoder = encoder
}
init(encoder: any SNLPEncoder, scalar: Scalar.Type = Double.self) {
_documentEncoder = encoder
}
/*
Implements a naive brute-force (exact linear scan) search ... better to use a more efficient data structure
TODO: Replace this with an index-backed (e.g. HNSW) search implementation
*/
func searchFor(_ query: String) -> [Item] {
let q = documentEncoder.encodeSentence(query)
if let index = encodedDocuments.firstIndex(of: q) {
return [documents[index]]
}
@inlinable
func addUntokenizedDocument(_ document: String) {
encodedDocuments[ encodedDocuments.count ] = (_documentEncoder.encodeSentence(document) as! [Scalar])
return []
}
}
//
// File.swift
//
//
// Created by Jim Wallace on 2024-04-03.
//
import Foundation
/*
Provides a bare bones implementation of SNLPDataItem so that String can be used in test cases
- Not a particularly reliable set of defaults, but enough to work with text
*/
/*
 Minimal SNLPDataItem conformance so plain String values can serve as
 documents in test cases. The defaults are placeholders, not meaningful
 metadata.
 */
extension String: SNLPDataItem {
    /// Placeholder timestamp — strings carry no real creation date.
    /// NOTE(review): `distantFuture` will sort these AFTER real items; confirm intent.
    public var createdOn: Date { return Date.distantFuture }
    /// The string is its own identifier.
    public var id: String { return self }
    /// The string is its own full text.
    public var fullText: String { return self }
}
......@@ -62,7 +62,7 @@ extension ContextFreeEncoder {
// These use memory mapping to load the values in more quickly
// TODO: Validate that this actually works on other systems... could easily be some issues
static func readDictionaryFromFile(_ url: URL) -> [String : [Scalar]] {
static func readDictionaryFromFile(_ url: URL, width: Int = 50) -> [String : [Scalar]] {
//let fileURL = URL(fileURLWithPath: filename)
var result: [String : [Scalar]]
......@@ -87,7 +87,7 @@ extension ContextFreeEncoder {
index = stringRange.upperBound
// Read the values
let valuesData = data[index..<(index + 50 * MemoryLayout<Double>.size)]
let valuesData = data[index..<(index + width * MemoryLayout<Double>.size)]
let values = valuesData.withUnsafeBytes { Array($0.bindMemory(to: Scalar.self)) }
// Add the key-value pair to the dictionary
......@@ -95,7 +95,7 @@ extension ContextFreeEncoder {
//debugPrint("\(key) -> \(values[0])")
}
index += 50 * MemoryLayout<Double>.size //TODO: Why is this magical 50 here?
index += width * MemoryLayout<Double>.size
} else {
break
}
......
......@@ -23,10 +23,10 @@
import Foundation
class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
struct ContextFreeEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
var dictionary: [String : [Scalar]]
let width: Int
let dimensions: UInt
var zeroes: [Scalar]
var count: Int { dictionary.count }
......@@ -34,25 +34,29 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
public enum PreComputedEmbeddings {
case glove6B50d
case glove6B100d
//case NLEmbedding
}
init(source: PreComputedEmbeddings) {
init() {
self.init(source: .glove6B50d)
}
init(source: PreComputedEmbeddings = .glove6B50d) {
dictionary = Dictionary<String,[Scalar]>()
var dictionaryToLoad: String
switch source {
case .glove6B50d:
width = 50
dimensions = 50
dictionaryToLoad = "glove.6B.50d"
case .glove6B100d:
width = 100
dimensions = 100
dictionaryToLoad = "glove.6B.100d"
}
zeroes = Array(repeating: Scalar(0), count: width) as! [Scalar]
zeroes = Array(repeating: Scalar(0), count: Int(dimensions))
// Try to load locally first
guard let url = Bundle.module.url(forResource: dictionaryToLoad, withExtension: "mmap") else {
......@@ -62,7 +66,6 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
return
}
dictionary = ContextFreeEncoder<Scalar>.readDictionaryFromFile(url)
}
subscript(_ token: String) -> [Scalar] {
......@@ -77,7 +80,6 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
func encodeToken(_ token: String) -> [Scalar] {
//print("\(token) --> \(dictionary[token] ?? zeroes)")
return dictionary[token] ?? zeroes
}
......@@ -89,7 +91,6 @@ class ContextFreeEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
result[i] += encoding[i]
}
}
//print("\(sentence) --> \(result)")
return result
}
}
......@@ -26,9 +26,10 @@ import Foundation
import CoreML
class CoreMLEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
var zeroes: [Scalar]
struct CoreMLEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
var zeroes: [Scalar] = []
var dimensions: UInt = 0
func encodeToken(_ token: String) -> [Scalar] {
fatalError("CoreMLEncoder not implemented yet. Get on it.")
......
......@@ -25,10 +25,11 @@
import Foundation
import NaturalLanguage
class NaturalLanguageEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
var zeroes: [Scalar] { Array(repeating: Scalar(0), count: 512) }
struct NaturalLanguageEncoder<Scalar: BinaryFloatingPoint>: SNLPEncoder {
var dimensions: UInt = 512
var zeroes: [Scalar] { Array(repeating: Scalar(0), count: Int(dimensions)) }
@inlinable
func encodeToken(_ token: String) -> [Scalar] {
if let embedding = NLEmbedding.wordEmbedding(for: .english) {
......
// Copyright (c) 2024 Henry Tian, Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//#if os(macOS)
//import Foundation
//import NaturalLanguage
//
//class NaturalLanguageContextualEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPEncoder {
//
// var zeroes: [Scalar] { Array(repeating: Scalar(0), count: 512) }
//
//
// @inlinable
// func encodeToken(_ token: String) -> [Scalar] {
// if let embedding = NLContextualEmbedding(language: .english) {
// return embedding.vector(for: token) as! [Scalar]
// }
// return zeroes
// }
//
// /**
// Adds a single untokenized document to the corpus, using default tokenization and text processing
// */
// @inlinable
// func encodeSentence(_ sentence: String) -> [Scalar] {
// if let embedding = NLEmbedding.sentenceEmbedding(for: .english) {
// if let result = embedding.vector(for: sentence) {
// return result as! [Scalar]
// }
// }
// return zeroes
// }
//
//}
//#endif
......@@ -23,10 +23,14 @@
import Foundation
class OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
struct OpenAIEncoder<Scalar: BinaryFloatingPoint & Codable>: SNLPAsyncEncoder {
var zeroes: [Scalar]
var dimensions: UInt
init() {
fatalError()
}
func fetchEncodingForToken(_ token: String) async throws -> [Scalar] {
fatalError("OpenAIEncoder not implemented. Get on it.")
......
#if canImport(GameplayKit) && os(macOS)
import SwiftUI
import HNSWAlgorithm
import HNSWSample
// MARK: go to Product -> Scheme -> SwiftNLPVisualizer then run
// TODO: Support this for SwiftNLP data structures instead of the sample
/// Renders one level of the HNSW graph: edges as black lines, nodes as blue
/// dots with their ids drawn in red.
struct GraphView: View {
    /// Node ids paired with their canvas positions.
    let points: [(Int, CGPoint)]
    /// Edge endpoints, already resolved to canvas positions.
    let edges: [(CGPoint, CGPoint)]

    var body: some View {
        Canvas { context, size in
            // Draw edges first so nodes render on top of them.
            for (startPoint, endPoint) in edges {
                var path = Path()
                path.move(to: startPoint)
                path.addLine(to: endPoint)
                context.stroke(path, with: .color(.black), lineWidth: 1)
            }
            for (id, point) in points {
                // FIX: CGRect's initializer takes `width:`, not `dimensions:`
                // (looked like a mechanical width->dimensions rename).
                context.fill(
                    Circle().path(in: CGRect(x: point.x - 5, y: point.y - 5, width: 10, height: 10)),
                    with: .color(.blue)
                )
                context.draw(Text("\(id)").bold().foregroundColor(.red), in: CGRect(x: point.x, y: point.y, width: 20, height: 20))
            }
        }
        .frame(maxWidth: .infinity, maxHeight: .infinity)
    }
}
/// Convenience projections of a DeterministicSampleVectorIndex into the
/// drawable point/edge lists consumed by GraphView.
extension DeterministicSampleVectorIndex {
    /// All (id, position) pairs present on the given graph level.
    func points(for level: Int) -> [(Int, CGPoint)] {
        var result: [(Int, CGPoint)] = []
        for id in base.graph.keys(on: level) {
            result.append((id, base.vectors[id]))
        }
        return result
    }

    /// Every neighborhood edge on the given level, as a pair of positions.
    func edges(for level: Int) -> [(CGPoint, CGPoint)] {
        var result: [(CGPoint, CGPoint)] = []
        for id in base.graph.keys(on: level) {
            for neighbor in base.graph.neighborhood(on: level, around: id) {
                result.append((base.vectors[id], base.vectors[neighbor]))
            }
        }
        return result
    }
}
/// Interactive HNSW visualization: a button inserts random vectors, a slider
/// tilts each level in 3D, and every graph level is drawn as its own GraphView.
struct VisualizerView: View {
    @State var index = DeterministicSampleVectorIndex(typicalNeighborhoodSize: 6)
    @State var angle: Angle = .zero
    @State var updateCount = 0 // since index isn't observable!

    var body: some View {
        VStack {
            HStack {
                Button("Add Data") {
                    index.insertRandom(range: 0...500)
                    updateCount += 1
                }
                // FIX: SwiftUI's frame modifier takes `width:`, not `dimensions:`.
                Slider(value: $angle.degrees, in: 0...89)
                    .frame(width: 100)
            }
            .padding()
            ScrollView {
                VStack {
                    let graph = index.base.graph
                    // Walk levels from the entry point downward.
                    ForEach(Array(sequence(state: graph.entry?.level, next: graph.descend)), id: \.self) { level in
                        let _ = updateCount // to force an update
                        Text("Level \(String(level))")
                        GraphView(
                            points: index.points(for: level),
                            edges: index.edges(for: level)
                        )
                        .rotation3DEffect(angle, axis: (1, 0, 0), perspective: 0)
                        // FIX: `width:` here as well (was the invalid `dimensions:`).
                        .frame(width: 600, height: 600, alignment: .top)
                        .frame(width: 600, height: 600 * cos(angle.radians))
                        Divider()
                    }
                }
            }
        }
    }
}
/// App entry point: a single window hosting the visualizer.
@main
struct HNSWVisualizerApp: App {
    // Bridges AppKit lifecycle callbacks (see AppDelegate) into this SwiftUI app.
    @NSApplicationDelegateAdaptor(AppDelegate.self) var appDelegate
    var body: some Scene {
        WindowGroup {
            VisualizerView()
        }
    }
}
/// Minimal AppKit delegate used to configure the process at launch.
class AppDelegate: NSObject, NSApplicationDelegate {
    func applicationDidFinishLaunching(_ notification: Notification) {
        // .regular gives the process a Dock icon and menu bar.
        NSApp.setActivationPolicy(.regular)
    }
}
#endif
#if os(macOS)
import XCTest
import Foundation
import CoreLMDB
import System
@testable import SwiftNLP
// MARK: These tests are not to be included within the pipeline
/// LMDB-backed (durable) HNSW corpus tests. These write to ~/Downloads/lmdb
/// and are therefore excluded from the CI pipeline via SKIP_TESTS.
final class DurableHNSWCorpusTests: XCTestCase {
    /// This is used to skip these tests in the GitLab pipeline
    override class var defaultTestSuite: XCTestSuite {
        if ProcessInfo.processInfo.environment["SKIP_TESTS"] == "DurableHNSWCorpusTests" {
            return XCTestSuite(name: "Empty")
        }
        return super.defaultTestSuite
    }

    /// Setting up constants for environment
    private let ONE_GB: Int = 1_073_741_824
    private let ONE_MB: Int = 1_048_576
    private let ONE_KB: Int = 1_024
    private let ONE_B: Int = 1
    private let DEFAULT_MAXREADERS: UInt32 = 126
    private let DEFAULT_MAXDBS: UInt32 = 10

    /// Setting up working directory
    private var workingDirectoryPath: FilePath!

    override func setUpWithError() throws {
        try super.setUpWithError()

        let fileManager = FileManager.default
        // NOTE(review): machine-local side effect (~/Downloads/lmdb) — the reason
        // these tests are kept out of the pipeline.
        let directoryURL = fileManager.homeDirectoryForCurrentUser.appendingPathComponent("/Downloads/lmdb")
        try fileManager.createDirectory(at: directoryURL, withIntermediateDirectories: true, attributes: nil)
        workingDirectoryPath = FilePath(directoryURL.path)

        /// This commented out code alternatively works in the XCode bundle resource environment
        // guard let resourcesPath = Bundle.module.resourcePath else { fatalError("Failed to find resource path.") }
        // let resourcesDirectoryURL = URL(fileURLWithPath: resourcesPath).appendingPathComponent("lmdb")
        // let fileManager = FileManager.default
        // try fileManager.createDirectory(at: resourcesDirectoryURL, withIntermediateDirectories: true, attributes: nil)
        // print("Resources directory: \(resourcesDirectoryURL)")
        // workingDirectoryPath = FilePath(resourcesDirectoryURL.path)
    }

    /// Round-trips a tiny corpus through LMDB: write three documents, commit,
    /// then reopen the namespace with a read transaction.
    func testBuildBasicCorpus() throws {
        let docs = [
            "CNTK formerly known as Computational Network Toolkit",
            "is a free easy-to-use open-source commercial-grade toolkit",
            "that enable us to train deep learning algorithms to learn like the human brain."
        ]

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Writing to LMDB
        let transaction = try Transaction.begin(.write, in: env)

        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
            namespace: "testBasicExample",
            in: transaction
        )

        for doc in docs {
            try corpus.addUntokenizedDocument(doc, in: transaction)
        }

        try transaction.commit()

        /// Reading from LMDB
        let readTransaction = try Transaction.begin(.read, in: env)

        let _ = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
            namespace: "testBasicExample",
            in: readTransaction
        )

        readTransaction.abort()

        // XCTAssert(readCorpus.count == 3)
        /// readCorpus.count == 3 will fail because we have not fixed the bug with setting size upon reads
        /// This is because size is only incremented when insertion is called but it is not called when read from disk!
    }

    /// Writes a small corpus, reopens it from disk, and runs a nearest-neighbour query.
    func testQueryBasicCorpus() async throws {
        let docs = [
            "The quick brown fox jumps over the lazy dog",
            "I enjoy taking long walks along the beach at sunset",
            "Advances in neural networks have enabled new AI capabilities",
            "The stock market experienced a significant downturn last week",
            "Cooking a good meal can be both an art and a science",
            "The exploration of space is both challenging and rewarding",
            "Machine learning models are becoming increasingly sophisticated",
            "I love reading about history and ancient civilizations"
        ]

        let query = "I like to read about new technology and artificial intelligence"
        //let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        let transaction = try Transaction.begin(.write, in: env)

        /// Saving the memory map to disk
        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
            namespace: "testBasicQueryExample",
            in: transaction
        )

        for doc in docs {
            try corpus.addUntokenizedDocument(doc, in: transaction)
        }

        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")

        try transaction.commit()

        do {
            // NOTE(review): encodeToken is called on a full sentence here —
            // encodeSentence looks like the intended call; confirm before changing.
            let queryVector: [Double] = corpus.documentEncoder.encodeToken(query).map { Double($0) }

            /// Reading the memory map (and dictionary) from disk
            // NOTE(review): opened as .write although only used for reading — confirm
            // whether .read suffices for index.find before tightening.
            let readTransaction = try Transaction.begin(.write, in: env)

            let readCorpus = try DurableHNSWCorpus<String,ContextFreeEncoder<Double>>(
                namespace: "testBasicQueryExample",
                in: readTransaction
            )

            readCorpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap") // TODO: move this to initializer?

            // FIX: query within the transaction that opened the corpus
            // (readTransaction); the original passed `transaction`, which was
            // already committed above.
            let result = try readCorpus.index.find(near: queryVector, limit: 8, in: readTransaction)

            for result in result {
                let key = Int(result.id.foreignKey)!
                print(readCorpus.getUntokenizedDocument(at: key))
            }

            readTransaction.abort()
        } catch {
            print("Error when trying corpus.encodedDocuments.find(): \(error)")
        }
        // FIX: removed a trailing second `transaction.commit()` — that write
        // transaction was already committed before the read above.
    }

    func testBuildGuelphSubredditCorpus() async throws {
        /// Generates the LMDB durable storage to disk but runs no tests otherwise

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Get subreddit data
        // FIX: error messages referenced waterloo_submissions.zst while the
        // resource being loaded is Guelph_submissions.zst.
        guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
            fatalError("Failed to find Guelph_submissions.zst in test bundle.")
        }
        guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
            fatalError("Failed to load Guelph_submissions.zst from test bundle.")
        }

        let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)

        let transaction = try Transaction.begin(.write, in: env)

        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
            encoder: documentEncoder,
            namespace: "subreddit_durable",
            in: transaction
        )

        /// Add documents to corpus
        for submission in submissions {
            if let text = submission.selftext {
                try corpus.addUntokenizedDocument(text, in: transaction)
            }
        }

        /// Save dictionary to disk
        corpus.saveDictionaryToDownloads(fileName: "dictionary.mmap")

        try transaction.commit()
    }

    /// Reopens the durable subreddit corpus built above and queries it.
    func testQueryGuelphSubredditCorpus() async throws {
        let documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)

        /// Setting up the environment
        let env = try Environment()
        try env.setMapSize(ONE_GB)
        try env.setMaxReaders(DEFAULT_MAXREADERS)
        try env.setMaxDBs(DEFAULT_MAXDBS)
        try env.open(path: workingDirectoryPath)

        /// Reading the memory map (and dictionary) from disk
        let transaction = try Transaction.begin(.read, in: env)

        let corpus = try DurableHNSWCorpus<String,ContextFreeEncoder>(
            encoder: documentEncoder,
            namespace: "subreddit_durable",
            in: transaction
        )

        corpus.dictionary = DurableHNSWCorpus.readDictionaryFromDownloads(fileName: "dictionary.mmap")

        let query = "I love waterloo and I love the geese."
        // NOTE(review): encodeToken on a full sentence — encodeSentence seems intended; confirm.
        let queryVector: [Double] = documentEncoder.encodeToken(query).map { Double($0) }

        let result = try corpus.index.find(near: queryVector, limit: 8, in: transaction)

        for result in result {
            let key = Int(result.id.foreignKey)!
            print(corpus.getUntokenizedDocument(at: key))
        }
    }
}
#endif
#if os(macOS)
import XCTest
import Foundation
import System
@testable import SwiftNLP
final class EphemeralHNSWCorpusTests: XCTestCase {
// MARK: EphemeralHNSWCorpus can also be used as its typealias HNSWCorpus
// Load a small set of documents and confirm that corpus and dictionary are updated accordingly
func testBuildBasicCorpus() throws {
let docs = [
"CNTK formerly known as Computational Network Toolkit",
"is a free easy-to-use open-source commercial-grade toolkit",
"that enable us to train deep learning algorithms to learn like the human brain."
]
var corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(docs)
XCTAssert(corpus.count == 3)
/// Make sure none of our encodings are zero
for item in corpus {
XCTAssertNotEqual(item.vector, corpus.zeroes)
}
}
// Load a bigger set of documents and confirm
func testBuildLargeCorpus() throws {
let twentyQuotes = [
"Imagination is more important than knowledge. - Albert Einstein",
"The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
"If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
"The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
"Science is the belief in the ignorance of experts. - Richard Feynman",
"The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
"Science is the poetry of reality. - Richard Dawkins",
"To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
"The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
"Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
"An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
"If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
"The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
"Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
"In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
"Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
"Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
"The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
"One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
"All science is either physics or stamp collecting. - Ernest Rutherford"
]
var corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(twentyQuotes)
XCTAssertEqual(corpus.count, 20)
/// Make sure none of our encodings are zero
for item in corpus {
XCTAssertNotEqual(item.vector, corpus.zeroes)
}
}
func testBuildGuelphSubredditCorpus() async throws {
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
fatalError("Failed to find waterloo_submissions.zst in test bundle.")
}
guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
fatalError("Failed to load waterloo_submissions.zst from test bundle.")
}
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
for submission in submissions {
if let text = submission.selftext {
corpus.addUntokenizedDocument(text)
}
}
XCTAssert(corpus.count == 17999)
}
// Load a small set of documents and confirm that corpus and dictionary are updated accordingly
func testQueryBasicCorpus() async throws {
let docs = [
"The quick brown fox jumps over the lazy dog",
"I enjoy taking long walks along the beach at sunset",
"Advances in neural networks have enabled new AI capabilities",
"The stock market experienced a significant downturn last week",
"Cooking a good meal can be both an art and a science",
"The exploration of space is both challenging and rewarding",
"Machine learning models are becoming increasingly sophisticated",
"I love reading about history and ancient civilizations"
]
let query = "I like to read about new technology and artificial intelligence"
//let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
var corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(docs)
//do {
//let queryVector: [Double] = _documentEncoder.encodeToken(query).map { Double($0) }
//let results = try corpus.index.find(near: queryVector, limit: 8)
let results = corpus.searchFor(query)
for result in results {
print(result)
}
//} catch {
// print("Error when trying corpus.encodedDocuments.find(): \(error)")
//}
}
func testQueryLargeCorpus() async throws {
let docs = [
"Imagination is more important than knowledge. - Albert Einstein",
"The greatest enemy of knowledge is not ignorance, it is the illusion of knowledge. - Stephen Hawking",
"If I have seen further it is by standing on the shoulders of giants. - Isaac Newton",
"The universe is a wondrous place! The faster you create unbreakable code, the faster the universe creates people that can break it. - Richard Feynman",
"Science is the belief in the ignorance of experts. - Richard Feynman",
"The saddest aspect of life right now is that science gathers knowledge faster than society gathers wisdom. - Isaac Asimov",
"Science is the poetry of reality. - Richard Dawkins",
"To raise new questions, new possibilities, to regard old problems from a new angle, requires creative imagination and marks real advance in science. - Albert Einstein",
"The scientist does not study nature because it is useful; he studies it because he delights in it, and he delights in it because it is beautiful. - Henri Poincaré",
"Nothing in life is to be feared, it is only to be understood. Now is the time to understand more, so that we may fear less. - Marie Curie",
"An experiment is a question which science poses to Nature, and a measurement is the recording of Nature’s answer. - Max Planck",
"If you wish to make an apple pie from scratch, you must first invent the universe. - Carl Sagan",
"The function of science fiction is not always to predict the future but sometimes to prevent it. - Frank Herbert",
"Science is what we understand well enough to explain to a computer. Art is everything else we do. - Donald Knuth",
"In science one tries to tell people, in such a way as to be understood by everyone, something that no one ever knew before. But in poetry, it's the exact opposite. - Paul Dirac",
"Science is a way of thinking much more than it is a body of knowledge. - Carl Sagan",
"Research is what I’m doing when I don’t know what I’m doing. - Wernher von Braun",
"The most beautiful thing we can experience is the mysterious. It is the source of all true art and science. - Albert Einstein",
"One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
"All science is either physics or stamp collecting. - Ernest Rutherford"
]
let query = "I love Albert Einstein!"
var corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(docs)
let results = corpus.searchFor(query)
for result in results {
print(result)
}
}
/// Loads the bundled Guelph subreddit archive, indexes every submission's
/// self-text into an HNSW-backed corpus, and prints the search results for
/// a fixed query sentence.
/// NOTE(review): the query mentions the University of Waterloo while the
/// corpus is built from r/Guelph submissions — confirm the mismatch is
/// intentional (e.g. to exercise an out-of-domain query).
func testQueryGuephSubredditCorpus() async throws {
    // Locate the compressed archive inside the test bundle.
    guard let archiveURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
        fatalError("Failed to find guelph_submissions.zst in test bundle.")
    }
    // Read the raw zst bytes; a missing/unreadable file is a test-setup failure.
    guard let archiveData = try? Data(contentsOf: archiveURL) else {
        fatalError("Failed to load guelph_submissions.zst from test bundle.")
    }

    // Decode the archive into submission records (decode errors are discarded).
    let (parsedSubmissions, _): ([Submission], [Data]) = try await loadFromRedditArchive(archiveData)

    // Index only submissions that actually carry body text.
    let corpus = HNSWCorpus<String,ContextFreeEncoder<Double>>(typicalNeighborhoodSize: 10)
    for body in parsedSubmissions.compactMap({ $0.selftext }) {
        corpus.addUntokenizedDocument(body)
    }

    let query = "Mr. Goose is a very important figure at the University of Waterloo."
    for match in corpus.searchFor(query) {
        print(match)
    }
}
/// Benchmark comparing HNSW corpus build time across a range of
/// `typicalNeighborhoodSize` values (2 through 1028) using the Guelph
/// subreddit archive.
///
/// The entire body is currently commented out, so this test is effectively
/// disabled and passes trivially.
/// NOTE(review): the commented guard messages reference
/// "waterloo_submissions.zst" while the resource loaded is
/// "Guelph_submissions" — reconcile the messages (and the hard-coded
/// 17999 count) before re-enabling.
func testTypicalNeighborhoodSize() async throws {
// guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
// fatalError("Failed to find waterloo_submissions.zst in test bundle.")
// }
// guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
// fatalError("Failed to load waterloo_submissions.zst from test bundle.")
// }
//
// let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
//
// let typicalNeighborhoodSizes = [2, 8, 16, 32, 64, 128, 512, 1028]
//
// for typicalNeighborhoodSize in typicalNeighborhoodSizes {
// let startTime = Date()
// var corpus = HNSWCorpus(encoding: .glove6B50d, typicalNeighborhoodSize: typicalNeighborhoodSize)
//
// for submission in submissions {
// if let text = submission.selftext {
// corpus.addUntokenizedDocument(text)
// }
// }
//
// XCTAssert(corpus.count == 17999)
//
// let endTime = Date()
// print("Typical neighborhood size \(typicalNeighborhoodSize) took \(endTime.timeIntervalSince(startTime)) seconds.")
// }
}
}
#endif
......@@ -13,13 +13,13 @@ final class ContextFreeEncoderTests: XCTestCase {
"that enable us to train deep learning algorithms to learn like the human brain."
]
var corpus = DictionaryCorpus(encoding: .glove6B50d)
var corpus = InMemoryCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(docs)
XCTAssert(corpus.encodedDocuments.count == 3)
// Make sure none of our encodings are zero
for c in corpus {
for c in corpus.encodedDocuments {
XCTAssertNotEqual(c, corpus.zeroes)
}
}
......@@ -50,14 +50,14 @@ final class ContextFreeEncoderTests: XCTestCase {
"All science is either physics or stamp collecting. - Ernest Rutherford"
]
var corpus = DictionaryCorpus(encoding: .glove6B50d)
var corpus = InMemoryCorpus<String,ContextFreeEncoder<Double>>()
corpus.addUntokenizedDocuments(twentyQuotes)
XCTAssertEqual(corpus.encodedDocuments.count, 20)
// Make sure none of our encodings are zero
for c in corpus {
for c in corpus.encodedDocuments {
XCTAssertNotEqual(c, corpus.zeroes)
}
}
......@@ -65,17 +65,17 @@ final class ContextFreeEncoderTests: XCTestCase {
func testSubreddit() async throws {
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
fatalError("Failed to find waterloo_submissions.zst in test bundle.")
fatalError("Failed to find guelph_submissions.zst in test bundle.")
}
guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
fatalError("Failed to load waterloo_submissions.zst from test bundle.")
fatalError("Failed to load guelph_submissions.zst from test bundle.")
}
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
//print("Errors: \(errors.count)")
let corpus = DictionaryCorpus(encoding: .glove6B50d)
var corpus = InMemoryCorpus<String,ContextFreeEncoder<Double>>()
for submission in submissions {
if let text = submission.selftext {
corpus.addUntokenizedDocument(text)
......
......@@ -13,15 +13,14 @@ final class NaturalLanguageEncoderTests: XCTestCase {
"is a free easy-to-use open-source commercial-grade toolkit",
"that enable us to train deep learning algorithms to learn like the human brain."
]
let encoder = NaturalLanguageEncoder<Double>()
var corpus = DictionaryCorpus(encoder: encoder)
var corpus = InMemoryCorpus<String,NaturalLanguageEncoder<Double>>()
corpus.addUntokenizedDocuments(docs)
XCTAssert(corpus.encodedDocuments.count == 3)
// Make sure none of our encodings are zero
for c in corpus {
for c in corpus.encodedDocuments {
XCTAssertNotEqual(c, corpus.zeroes)
}
}
......@@ -51,16 +50,15 @@ final class NaturalLanguageEncoderTests: XCTestCase {
"One, remember to look up at the stars and not down at your feet. Two, never give up work. Work gives you meaning and purpose and life is empty without it. Three, if you are lucky enough to find love, remember it is there and don't throw it away. - Stephen Hawking",
"All science is either physics or stamp collecting. - Ernest Rutherford"
]
let encoder = NaturalLanguageEncoder<Double>()
var corpus = DictionaryCorpus(encoder: encoder)
var corpus = InMemoryCorpus<String,NaturalLanguageEncoder<Double>>()
corpus.addUntokenizedDocuments(twentyQuotes)
XCTAssertEqual(corpus.encodedDocuments.count, 20)
// Make sure none of our encodings are zero
for c in corpus {
for c in corpus.encodedDocuments {
XCTAssertNotEqual(c, corpus.zeroes)
}
}
......@@ -68,16 +66,16 @@ final class NaturalLanguageEncoderTests: XCTestCase {
func testSubreddit() async throws {
guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else {
fatalError("Failed to find waterloo_submissions.zst in test bundle.")
fatalError("Failed to find guelph_submissions.zst in test bundle.")
}
guard let submissionsData = try? Data(contentsOf: submissionsURL) else {
fatalError("Failed to load waterloo_submissions.zst from test bundle.")
fatalError("Failed to load guelph_submissions.zst from test bundle.")
}
let (submissions, _ ): ([Submission],[Data]) = try await loadFromRedditArchive(submissionsData)
let encoder = NaturalLanguageEncoder<Double>()
var corpus = DictionaryCorpus(encoder: encoder)
var corpus = InMemoryCorpus<String,NaturalLanguageEncoder<Double>>(encoder: encoder)
for submission in submissions {
if let text = submission.selftext {
......
//#if os(macOS)
//import XCTest
//import Foundation
//import NaturalLanguage
//@testable import SwiftNLP
//
//// MARK: See AllMiniLM_pipelineTest.swift
////TODO: Locate the module or target that defines TestUtils so these tests can be re-enabled.
//
//final class HNSWPipelineTest: XCTestCase {
//
// // test fetching names of all the files
// func testFileNameFetching() throws {
// let redditCommentNames = TestUtils.getJsonFiles(prefix: "RC")
// print("reddit comment files: \(redditCommentNames)")
// let redditSubmissionNames = TestUtils.getJsonFiles(prefix: "RS")
// print("reddit submission files: \(redditSubmissionNames)")
// }
//
// // test reading reddit submission json files into actual objects
// func testRedditSubmissions() throws {
// let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
// for jsonData in redditSubmissionJson {
// let redditSubmission = TestUtils.readRedditSubmissionJson(json: jsonData)
// XCTAssertNotNil(redditSubmission, "Failed to decode RedditSubmissionData")
// }
// }
//
// // test reading reddit comment json files into actual objects
// func testRedditComments() throws {
// let redditCommentJson = TestUtils.loadAllRedditComment()
// for jsonData in redditCommentJson {
// let redditComment = TestUtils.readRedditCommentJson(json: jsonData)
// XCTAssertNotNil(redditComment, "Failed to decode RedditCommentData")
// }
// }
//
// func test20kDownload() async throws {
//
// let result = try await downloadSubredditFromServer(subreddit: "StopGaming")
// print("Loaded \(result.count) threads from server.")
// if let random = result.randomElement() {
// let (key, value) = random
// print("Key: \(key), Value: \(value)")
// }
// XCTAssertEqual(result.count, 34829, "Failed to load subreddit data from https://reddit-top20k.cworld.ai")
//
// }
//
//
// func testDocumentReading() async throws {
// // loads all json data for test documents
// let redditCommentJson = TestUtils.loadAllRedditComment()
// let redditSubmissionJson = TestUtils.loadAllRedditSubmission()
//
// let redditComments = redditCommentJson.compactMap { TestUtils.readRedditCommentJson(json: $0)}
// let redditSubmissions = redditSubmissionJson.compactMap { TestUtils.readRedditSubmissionJson(json: $0) }
//
// var bodies: [String] = []
//
// // load all the reddit comments' body as comment to the document
// for comment in redditComments {
// //debugPrint("Processing \(comment.posts.count) comments")
//
// for post in comment.posts {
// if let body = post.body {
// bodies.append(body)
// }
// }
// }
//
// for submission in redditSubmissions {
// //debugPrint("Processing \(submission.posts.count) submissions")
//
// for post in submission.posts {
// if let p = post.selftext {
// //debugPrint(p)
// bodies.append(p)
// }
// }
// }
//
// // Debug code
//// bodies = Array(bodies.prefix(10))
//// print(bodies)
//
// // start to encode the db and query
//// var database_embedding: [[Float]] = []
//// var query_embedding: [Float] = []
//// let query = "stop playing video games"
//// var embedding_dim: Int = 384
//// var model = MiniLMEmbeddings()
//// query_embedding = await model.encode(sentence: query)!
////
//// var i = 1
//// //append sentence embedding to database_embedding
//// for string in bodies {
//// if let vector = await model.encode(sentence: string) {
//// database_embedding.append(vector)
//// //print(i)
//// i += 1
//// } else {
//// fatalError("Error occurred1")
//// }
////
//// }
////
// let _documentEncoder = ContextFreeEncoder<Double>(source: .glove6B50d)
// var corpus = HNSWCorpus(encoder: _documentEncoder)
// corpus.addUntokenizedDocuments(bodies)
//
// let size = MemoryLayout.size(ofValue: corpus)
// print("Approximate memory footprint: \(size) bytes")
//
// do {
// print("Attempting to query corpus.encodedDocuments.find()...")
// let query = "stop playing video games"
// let queryVector = _documentEncoder.encodeToken(query)
// let results = try corpus.encodedDocuments.find(near: queryVector, limit: 10)
// print(results)
// print("Query completed!")
// } catch {
// print("Error when trying corpus.encodedDocuments.find(): \(error)")
// }
//
//// let index = AnnoyIndex<Float>(itemLength: embedding_dim, metric: .euclidean)
////
//// try? index.addItems(items: &database_embedding)
//// try? index.build(numTrees: 50)
////
//// let results = index.getNNsForVector(vector: &query_embedding, neighbors: 10)
////
//// print(results)
// }
//}
//#endif