Skip to content
Snippets Groups Projects
Commit 3da0e33a authored by Jim Wallace's avatar Jim Wallace
Browse files

Merge branch 'hnsw' into 'main'

HNSW Implementation with Testcases

See merge request jrwallace/swiftnlp!13
parents e2158933 452de2bd
No related branches found
No related tags found
1 merge request!13HNSW Implementation with Testcases
Pipeline #115587 passed with warnings
Showing
with 1214 additions and 24 deletions
...@@ -18,6 +18,7 @@ build-macOS: ...@@ -18,6 +18,7 @@ build-macOS:
test-macOS: test-macOS:
stage: test stage: test
script: script:
- export SKIP_TESTS=DurableHNSWCorpusTests
- swift test -c release -Xswiftc -enable-testing - swift test -c release -Xswiftc -enable-testing
# - swift test --sanitize=address -c release -Xswiftc -enable-testing # - swift test --sanitize=address -c release -Xswiftc -enable-testing
# - swift test --sanitize=thread -c release -Xswiftc -enable-testing # - swift test --sanitize=thread -c release -Xswiftc -enable-testing
......
...@@ -79,7 +79,7 @@ ...@@ -79,7 +79,7 @@
</Testables> </Testables>
</TestAction> </TestAction>
<LaunchAction <LaunchAction
buildConfiguration = "Debug" buildConfiguration = "Release"
selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB" selectedDebuggerIdentifier = "Xcode.DebuggerFoundation.Debugger.LLDB"
selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB" selectedLauncherIdentifier = "Xcode.DebuggerFoundation.Launcher.LLDB"
launchStyle = "0" launchStyle = "0"
......
...@@ -36,6 +36,15 @@ ...@@ -36,6 +36,15 @@
"version" : "0.1.14" "version" : "0.1.14"
} }
}, },
{
"identity" : "surge",
"kind" : "remoteSourceControl",
"location" : "https://github.com/Jounce/Surge.git",
"state" : {
"revision" : "6e4a47e63da8801afe6188cf039e9f04eb577721",
"version" : "2.3.2"
}
},
{ {
"identity" : "swift-numerics", "identity" : "swift-numerics",
"kind" : "remoteSourceControl", "kind" : "remoteSourceControl",
......
...@@ -6,32 +6,48 @@ import PackageDescription ...@@ -6,32 +6,48 @@ import PackageDescription
let package = Package( let package = Package(
name: "SwiftNLP", name: "SwiftNLP",
platforms: [ platforms: [
.macOS(.v13), .macOS(.v13),
], ],
products: [ products: [
.library( .library(
name: "SwiftNLP", name: "SwiftNLP",
targets: ["SwiftNLP"]), targets: ["SwiftNLP"]
),
/// This is commented out to fix the gitlab pipeline, but must be uncommented when in use on macOS only.
// .executable(
// name: "SwiftNLPVisualizer",
// targets: ["SwiftNLPVisualizer"]
// ),
], ],
dependencies: [ dependencies: [
//.package(url: "https://github.com/jbadger3/SwiftAnnoy", .upToNextMajor(from: "1.0.0")),
.package(url: "https://github.com/L1MeN9Yu/Elva", .upToNextMajor(from: "2.1.3")), .package(url: "https://github.com/L1MeN9Yu/Elva", .upToNextMajor(from: "2.1.3")),
.package(url: "https://github.com/JadenGeller/similarity-topology", .upToNextMajor(from: "0.1.14")) .package(url: "https://github.com/JadenGeller/similarity-topology", .exact("0.1.14")),
.package(url: "https://github.com/Jounce/Surge.git", .upToNextMajor(from: "2.0.0")),
// .package(url: "https://github.com/mingchungx/nifty.git", .branch("master"))
], ],
targets: [ targets: [
.target( .target(
name: "SwiftNLP", name: "SwiftNLP",
dependencies: [ dependencies: [
//"SwiftAnnoy",
.product(name: "HNSWAlgorithm", package: "similarity-topology"), .product(name: "HNSWAlgorithm", package: "similarity-topology"),
.product(name: "HNSWEphemeral", package: "similarity-topology"), .product(name: "HNSWEphemeral", package: "similarity-topology"),
.product(name: "HNSWDurable", package: "similarity-topology", condition: .when(platforms: [.macOS])),
.product(name: "HNSWSample", package: "similarity-topology", condition: .when(platforms: [.macOS])),
// .product(name: "Nifty", package: "Nifty"),
.product(name: "ZSTD", package: "Elva"), .product(name: "ZSTD", package: "Elva"),
.byName(name: "Surge", condition: .when(platforms: [.macOS])),
], ],
resources: [.process("Resources")] resources: [.process("Resources")]
), ),
.testTarget( .testTarget(
name: "SwiftNLPTests", name: "SwiftNLPTests",
dependencies: ["SwiftNLP"], dependencies: ["SwiftNLP"],
resources: [.process("Resources")]), resources: [.process("Resources")]
),
/// This is commented out to fix the gitlab pipeline, but must be uncommented when in use on macOS only.
// .executableTarget(
// name: "SwiftNLPVisualizer",
// dependencies: ["SwiftNLP"]
// ),
] ]
) )
...@@ -48,7 +48,7 @@ print(topicModel) ...@@ -48,7 +48,7 @@ print(topicModel)
- [ ] Linux via PythonKit? - [ ] Linux via PythonKit?
- *Topic Modelling* - *Topic Modelling*
- [ ] Linear Algebra (e.g., [Nifty](https://github.com/philipce/nifty), [Surge](https://github.com/Jounce/Surge)) - [X] Linear Algebra (e.g., [Nifty](https://github.com/philipce/nifty), [Surge](https://github.com/Jounce/Surge))
- [ ] Dimensionality Reduction (e.g., [t-SNE](https://github.com/emannuelOC/swift-tsne), [UMAP](https://github.com/LTLA/umappp)) - [ ] Dimensionality Reduction (e.g., [t-SNE](https://github.com/emannuelOC/swift-tsne), [UMAP](https://github.com/LTLA/umappp))
- [ ] Clustering (e.g., K-Means, HDBSCAN) - [ ] Clustering (e.g., K-Means, HDBSCAN)
- [ ] Topic models - [ ] Topic models
...@@ -62,6 +62,7 @@ This project is developed by a team of researchers from the [Human-Computer Inte ...@@ -62,6 +62,7 @@ This project is developed by a team of researchers from the [Human-Computer Inte
- Peter Li - Peter Li
- Adrian Davila - Adrian Davila
- Henry Tian - Henry Tian
- Mingchung Xia
If you would like to contribute to the project, [contact Prof. Wallace](mailto:james.wallace@uwaterloo.ca) with "SwiftNLP" in the subject line, and mention one or more of the roadmap items above that you would like to work on. If you would like to contribute to the project, [contact Prof. Wallace](mailto:james.wallace@uwaterloo.ca) with "SwiftNLP" in the subject line, and mention one or more of the roadmap items above that you would like to work on.
......
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-02-26.
//
#if os(macOS)
import Foundation
extension DurableHNSWCorpus {
    /// An original (untokenized) document paired with the vector it was encoded to.
    ///
    /// This is the value type stored in the corpus `dictionary`, keyed by `Int`.
    public struct DocumentVectorPair {
        var untokenizedDocument: String
        var vector: [Scalar]

        init(untokenizedDocument: String, vector: [Scalar]) {
            self.vector = vector
            self.untokenizedDocument = untokenizedDocument
        }
    }

    /// Returns the untokenized document stored under `key`.
    ///
    /// Terminates the process with `fatalError` when `key` is not in the dictionary.
    @inlinable
    func getUntokenizedDocument(at key: Int) -> String {
        guard let entry = dictionary[key] else {
            fatalError("Key \(key) not found in HNSW dictionary")
        }
        return entry.untokenizedDocument
    }

    /// Returns the vector stored under `key`.
    ///
    /// Terminates the process with `fatalError` when `key` is not in the dictionary.
    @inlinable
    func getVector(at key: Int) -> [Scalar] {
        guard let entry = dictionary[key] else {
            fatalError("Key \(key) not found in HNSW dictionary")
        }
        return entry.vector
    }

    /// Returns the whole key-to-pair dictionary.
    @inlinable
    func getDictionary() -> [Int: DocumentVectorPair] {
        dictionary
    }

    /// Stores `document` and `vector` under `key`, replacing any pair already stored there.
    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
        let pair = DocumentVectorPair(untokenizedDocument: document, vector: vector)
        dictionary[key] = pair
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-12.
//
#if os(macOS)
import Foundation
import System
// MARK: Workaround for saving and loading the untokenized documents of a DurableHNSWCorpus
// Loading the memory-mapped data through CoreLMDB does not restore the untokenized documents (or the other fields) of a DurableHNSWCorpus, so this extension writes the original data to disk and reads it back separately.
// Eventually, the code in this extension should move to HNSWCorpusDataHandler, once a general wrapper class for DurableHNSW and EphemeralHNSW exists.
extension DurableHNSWCorpus {
    /// Saves `dictionary` to a file named `fileName` inside the user's Downloads directory.
    ///
    /// Prints a message and returns without writing when the Downloads directory cannot be located.
    func saveDictionaryToDownloads(fileName: String) {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return
        }
        saveDictionaryMemoryMap(url: downloadsURL.appendingPathComponent(fileName))
    }

    /// Serializes `dictionary` to the file at `url`, replacing any previous contents.
    ///
    /// File layout (every integer is an `Int` in its in-memory representation):
    /// the entry count, then for each entry its key, the UTF-8 byte count of the document,
    /// the UTF-8 bytes, the number of vector elements, and the raw bytes of the vector.
    /// Entries are written in dictionary iteration order. Failures are printed, not thrown.
    func saveDictionaryMemoryMap(url: URL) {
        let fileManager = FileManager.default
        if !fileManager.fileExists(atPath: url.path) {
            fileManager.createFile(atPath: url.path, contents: nil, attributes: nil)
        }
        do {
            let fileHandle = try FileHandle(forWritingTo: url)
            defer { try? fileHandle.close() }

            // An existing file is opened as-is; drop its old bytes so that a shorter
            // dictionary does not leave stale data after the new contents.
            try fileHandle.truncate(atOffset: 0)

            // The throwing write API is used so that I/O failures reach the catch below.
            try fileHandle.write(contentsOf: withUnsafeBytes(of: dictionary.count) { Data($0) })

            for (key, value) in dictionary {
                try fileHandle.write(contentsOf: withUnsafeBytes(of: key) { Data($0) })

                // Document: byte count first, so the reader knows how much to consume.
                let documentData = Data(value.untokenizedDocument.utf8)
                try fileHandle.write(contentsOf: withUnsafeBytes(of: documentData.count) { Data($0) })
                try fileHandle.write(contentsOf: documentData)

                // Vector: element count first, followed by the raw element bytes.
                let vectorData = value.vector.withUnsafeBytes { Data($0) }
                try fileHandle.write(contentsOf: withUnsafeBytes(of: value.vector.count) { Data($0) })
                try fileHandle.write(contentsOf: vectorData)
            }
        } catch {
            print("Error writing dictionary to file: \(error)")
        }
    }

    /// Reads a dictionary from a file named `fileName` inside the user's Downloads directory.
    ///
    /// Returns an empty dictionary when the Downloads directory cannot be located.
    static func readDictionaryFromDownloads(fileName: String, width: Int = 50) -> HNSWDictionary {
        guard let downloadsURL = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first else {
            print("Could not find Downloads directory")
            return [:]
        }
        return readDictionaryMemoryMap(downloadsURL.appendingPathComponent(fileName), width: width)
    }

    /// Reads a dictionary previously written by `saveDictionaryMemoryMap(url:)`.
    ///
    /// Every length read from the file is validated against the remaining data, so a truncated
    /// or corrupt file yields the entries decoded so far (plus a printed message) instead of a crash.
    /// An entry whose document is not valid UTF-8 is skipped, but its bytes are still consumed so
    /// that the following entries are parsed from the correct offset.
    ///
    /// - Parameters:
    ///   - url: Location of the file to read.
    ///   - width: Width is the number of dimensions of the glove encoding. It is not used for
    ///     parsing, because the file stores each vector's length; it is kept for source compatibility.
    /// - Returns: The decoded dictionary; empty when the file cannot be read.
    // TODO: Improve this to not need to take in a width, rather switch between the encoding / encoder
    static func readDictionaryMemoryMap(_ url: URL, width: Int = 50) -> HNSWDictionary {
        var dictionary = HNSWDictionary()

        let data: Data
        do {
            data = try Data(contentsOf: url)
        } catch {
            print("Error reading dictionary from file: \(error)")
            return dictionary
        }

        var index = 0
        guard let count = readInt(from: data, at: &index), count >= 0 else {
            print("Error reading dictionary from file: missing or invalid entry count")
            return dictionary
        }

        // Array elements are laid out stride bytes apart, which is what the writer emits.
        let scalarStride = MemoryLayout<Scalar>.stride

        for _ in 0..<count {
            guard let key = readInt(from: data, at: &index),
                  let documentLength = readInt(from: data, at: &index) else {
                print("Error reading dictionary from file: unexpected end of data")
                break
            }
            guard documentLength >= 0, documentLength <= data.count - index else {
                print("Error reading dictionary from file: invalid document length")
                break
            }
            let documentData = data.subdata(in: index..<index + documentLength)
            index += documentLength

            guard let vectorLength = readInt(from: data, at: &index) else {
                print("Error reading dictionary from file: unexpected end of data")
                break
            }
            guard vectorLength >= 0, vectorLength <= (data.count - index) / scalarStride else {
                print("Error reading dictionary from file: invalid vector length")
                break
            }

            // Copy the whole vector in one pass instead of allocating a Data per element.
            let vectorRange = index..<index + vectorLength * scalarStride
            var vector = [Scalar](repeating: 0, count: vectorLength)
            vector.withUnsafeMutableBufferPointer { destination in
                _ = data.copyBytes(to: destination, from: vectorRange)
            }
            index = vectorRange.upperBound

            // Decode last: the offsets above must advance even when this entry is dropped.
            guard let document = String(data: documentData, encoding: .utf8) else {
                print("Failed to decode string")
                continue // Skip this entry on failure
            }

            dictionary[key] = DocumentVectorPair(untokenizedDocument: document, vector: vector)
        }

        return dictionary
    }

    /// Reads one `Int` from `data` at `offset` and advances `offset` past it.
    ///
    /// Returns `nil`, leaving `offset` unchanged, when fewer than `MemoryLayout<Int>.size` bytes remain.
    private static func readInt(from data: Data, at offset: inout Int) -> Int? {
        let size = MemoryLayout<Int>.size
        guard offset >= 0, data.count - offset >= size else { return nil }

        let range = offset..<offset + size
        var value = 0
        // Copying into a local avoids any alignment requirement on the source bytes.
        withUnsafeMutablePointer(to: &value) { pointer in
            _ = data.copyBytes(to: UnsafeMutableBufferPointer(start: pointer, count: 1), from: range)
        }
        offset = range.upperBound
        return value
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-16.
//
#if os(macOS)
import Foundation
/// `DurableHNSWCorpus` is a collection of its document/vector pairs, ordered by ascending dictionary key.
///
/// A position is an offset into the ascending list of dictionary keys (`0 ..< dictionary.count`),
/// not a dictionary key itself.
extension DurableHNSWCorpus: Sequence, Collection {
    // Sequence Protocol Requirements

    /// Returns an iterator over a snapshot of the pairs, in ascending key order.
    ///
    /// The order matches the subscript below, so `for-in` and index-based traversal
    /// visit the same elements in the same order.
    @inlinable
    func makeIterator() -> AnyIterator<DocumentVectorPair> {
        var orderedPairs = dictionary
            .sorted { $0.key < $1.key }
            .map { $0.value }
            .makeIterator()
        return AnyIterator { orderedPairs.next() }
    }

    // Collection Protocol Requirements

    /// The first position; positions are offsets into the sorted keys, so this is always 0.
    @inlinable
    var startIndex: Int {
        return 0
    }

    /// The past-the-end position: the number of entries in `dictionary`.
    @inlinable
    var endIndex: Int {
        return dictionary.count
    }

    /// Returns the pair whose key is the `position`-th smallest key in `dictionary`.
    ///
    /// The keys are sorted on every access; an out-of-range `position` traps.
    @inlinable
    subscript(position: Int) -> DocumentVectorPair {
        let key = dictionary.keys.sorted()[position]
        guard let pair = dictionary[key] else {
            fatalError("Key \(key) not found in HNSW dictionary")
        }
        return pair
    }

    /// Returns the position immediately after `i`.
    @inlinable
    func index(after i: Int) -> Int {
        return i + 1
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-02-26.
//
// MARK: DurableHNSWCorpus is only available on macOS, not Linux
#if os(macOS)
import Foundation
import CoreLMDB
import CoreLMDBCoders
// MARK: DurableHNSWCorpus cannot conform to SNLPCorpus under its current definition
// This is because addUntokenizedDocument in a DurableHNSWCorpus requires an additional parameter (transaction) and can throw
final class DurableHNSWCorpus<Scalar: BinaryFloatingPoint & Codable & UnsafeMemoryLayoutStorableFloat> {
    public typealias HNSWDictionary = [Int: DocumentVectorPair]

    /// Encoder used to turn each document into a vector.
    internal var _documentEncoder: any SNLPEncoder

    /// The encoder's `zeroes` value, force-cast to `[Scalar]`.
    var zeroes: [Scalar] { _documentEncoder.zeroes as! [Scalar] }

    /// Index that receives the encoded vectors.
    var encodedDocuments: DeterministicDurableVectorIndex<Scalar>

    /// The `size` reported by `encodedDocuments`.
    var count: Int { encodedDocuments.size }

    // Keeps track of the original document for client code
    var dictionary: HNSWDictionary = [:]

    /// Creates a corpus whose encoder is a `ContextFreeEncoder` built from `encoding`.
    ///
    /// `scalar` is not read by the initializer body.
    // typicalNeighbourhoodSize = 20 is a standard benchmark
    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
        self._documentEncoder = ContextFreeEncoder(source: encoding)
        let index = try DeterministicDurableVectorIndex<Scalar>(
            namespace: namespace,
            typicalNeighborhoodSize: typicalNeighborhoodSize,
            in: transaction
        )
        self.encodedDocuments = index
    }

    /// Creates a corpus that encodes documents with the supplied `encoder`.
    ///
    /// `scalar` is not read by the initializer body.
    init(encoder: any SNLPEncoder, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20, namespace: String = "hnsw", in transaction: Transaction) throws {
        self._documentEncoder = encoder
        let index = try DeterministicDurableVectorIndex<Scalar>(
            namespace: namespace,
            typicalNeighborhoodSize: typicalNeighborhoodSize,
            in: transaction
        )
        self.encodedDocuments = index
    }

    /// Encodes `document`, inserts the vector into the index within `transaction`,
    /// and records the document/vector pair under the key returned by the insert.
    @inlinable
    func addUntokenizedDocument(_ document: String, in transaction: Transaction) throws {
        /// The force cast to [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
        let sentenceVector = _documentEncoder.encodeSentence(document) as! [Scalar]
        /// encodedDocuments.insert will insert and return the corresponding key (id)
        let insertedKey = try encodedDocuments.insert(sentenceVector, in: transaction)
        addDocumentVectorPair(at: insertedKey, document: document, vector: sentenceVector)
    }
}
#endif
//// Copyright (c) 2024 Jim Wallace
////
//// Permission is hereby granted, free of charge, to any person
//// obtaining a copy of this software and associated documentation
//// files (the "Software"), to deal in the Software without
//// restriction, including without limitation the rights to use,
//// copy, modify, merge, publish, distribute, sublicense, and/or sell
//// copies of the Software, and to permit persons to whom the
//// Software is furnished to do so, subject to the following
//// conditions:
////
//// The above copyright notice and this permission notice shall be
//// included in all copies or substantial portions of the Software.
////
//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
//// OTHER DEALINGS IN THE SOFTWARE.
////
//// Created by Mingchung Xia on 2024-02-07.
////
//
//import Foundation
//
//// MARK: Decodable conformance is in HNSWCorpus
//
//extension EphemeralHNSWCorpus: Codable {
// enum CodingKeys: String, CodingKey {
// case _documentEncoder
// case encodedDocuments
// case dictionary
// }
//
// func encode(to encoder: Encoder) throws {
// var container = encoder.container(keyedBy: CodingKeys.self)
// try container.encode(_documentEncoder, forKey: ._documentEncoder)
// try container.encode(encodedDocuments, forKey: .encodedDocuments)
// try container.encode(dictionary, forKey: .dictionary)
// }
//}
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-02-14.
//
import Foundation
extension EphemeralHNSWCorpus {
    /// An original (untokenized) document paired with the vector it was encoded to.
    ///
    /// This is the value type stored in the corpus `dictionary`, keyed by `Int`.
    public struct DocumentVectorPair {
        var untokenizedDocument: String
        var vector: [Scalar]

        init(untokenizedDocument: String, vector: [Scalar]) {
            self.vector = vector
            self.untokenizedDocument = untokenizedDocument
        }
    }

    /// Looks up the untokenized document stored under `key`.
    ///
    /// Terminates the process with `fatalError` when `key` is not in the dictionary.
    @inlinable
    func getUntokenizedDocument(at key: Int) -> String {
        guard let found = dictionary[key] else {
            fatalError("Key \(key) not found in HNSW dictionary")
        }
        return found.untokenizedDocument
    }

    /// Looks up the vector stored under `key`.
    ///
    /// Terminates the process with `fatalError` when `key` is not in the dictionary.
    @inlinable
    func getVector(at key: Int) -> [Scalar] {
        guard let found = dictionary[key] else {
            fatalError("Key \(key) not found in HNSW dictionary")
        }
        return found.vector
    }

    /// Returns the whole key-to-pair dictionary.
    @inlinable
    func getDictionary() -> [Int: DocumentVectorPair] {
        dictionary
    }

    /// Stores `document` and `vector` under `key`, replacing any pair already stored there.
    func addDocumentVectorPair(at key: Int, document: String, vector: [Scalar]) {
        let pair = DocumentVectorPair(untokenizedDocument: document, vector: vector)
        dictionary[key] = pair
    }
}
extension EphemeralHNSWCorpus.DocumentVectorPair: Codable where Scalar: Codable {
    /// Keys of the keyed container; the raw values equal the property names.
    enum CodingKeys: String, CodingKey {
        case untokenizedDocument
        case vector
    }

    /// Decodes the document first and the vector second from a container keyed by `CodingKeys`.
    internal init(from decoder: Decoder) throws {
        let values = try decoder.container(keyedBy: CodingKeys.self)
        try self.init(
            untokenizedDocument: values.decode(String.self, forKey: .untokenizedDocument),
            vector: values.decode([Scalar].self, forKey: .vector)
        )
    }

    /// Encodes the document first and the vector second into a container keyed by `CodingKeys`.
    internal func encode(to encoder: Encoder) throws {
        var values = encoder.container(keyedBy: CodingKeys.self)
        try values.encode(self.untokenizedDocument, forKey: .untokenizedDocument)
        try values.encode(self.vector, forKey: .vector)
    }
}
...@@ -20,37 +20,47 @@ ...@@ -20,37 +20,47 @@
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE. // OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-02-14.
//
extension HNSWCorpus: Sequence { import Foundation
typealias Element = [Scalar] /// HNSWCorpus iterates through its dictionary of key to document vector pairs
extension EphemeralHNSWCorpus: Sequence, Collection {
// Sequence Protocol Requirements // Sequence Protocol Requirements
@inlinable @inlinable
func makeIterator() -> Dictionary<Int, [Scalar]>.Values.Iterator { func makeIterator() -> AnyIterator<DocumentVectorPair> {
return encodedDocuments.values.makeIterator() var iterator = dictionary.values.makeIterator()
return AnyIterator {
return iterator.next()
}
} }
// Collection Protocol Requirements // Collection Protocol Requirements
@inlinable @inlinable
var startIndex: Dictionary<Int, [Scalar]>.Index { var startIndex: Int {
return encodedDocuments.startIndex return dictionary.keys.sorted().startIndex
} }
@inlinable @inlinable
var endIndex: Dictionary<Int, [Scalar]>.Index { var endIndex: Int {
return encodedDocuments.endIndex return dictionary.keys.sorted().endIndex
} }
@inlinable @inlinable
subscript(position: Dictionary<Int, [Scalar]>.Index) -> [Scalar] { subscript(position: Int) -> DocumentVectorPair {
encodedDocuments.values[position] let key = dictionary.keys.sorted()[position]
guard let pair = dictionary[key] else {
fatalError("Key \(key) not found in HNSW dictionary")
}
return pair
} }
@inlinable @inlinable
func index(after i: Dictionary<Int, [Scalar]>.Index) -> Dictionary<Int, [Scalar]>.Index { func index(after i: Int) -> Int {
return encodedDocuments.index(after: i) return dictionary.keys.sorted().index(after: i)
} }
} }
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// The HNSW work is based on the original work of Jaden Geller
// See the https://github.com/JadenGeller/similarity-topology.git
// for reference. The code is used with permission from the author
// under the MIT License.
//
// Created by Mingchung Xia on 2024-02-14.
//
import Foundation
// MARK: Allow EphemeralHNSWCorpus to simply be used as HNSWCorpus
typealias HNSWCorpus = EphemeralHNSWCorpus

final class EphemeralHNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus {
    public typealias HNSWDictionary = [Int: DocumentVectorPair]

    /// Encoder used to turn each document into a vector.
    internal var _documentEncoder: any SNLPEncoder

    /// The encoder's `zeroes` value, force-cast to `[Scalar]`.
    var zeroes: [Scalar] { _documentEncoder.zeroes as! [Scalar] }

    /// Index that receives the encoded vectors.
    var encodedDocuments: DeterministicEphemeralVectorIndex<[Scalar]>

    /// Number of vectors held by the underlying index.
    var count: Int { encodedDocuments.base.vectors.count }

    // Keeps track of the original document for client code
    var dictionary: HNSWDictionary = [:]

    /// Creates a corpus whose encoder is a `ContextFreeEncoder` built from `encoding`.
    ///
    /// `scalar` is not read by the initializer body.
    // typicalNeighbourhoodSize = 20 is a standard benchmark
    init(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, scalar: Scalar.Type = Double.self,
         typicalNeighborhoodSize: Int = 20) {
        self._documentEncoder = ContextFreeEncoder(source: encoding)
        self.encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(
            typicalNeighborhoodSize: typicalNeighborhoodSize
        )
    }

    /// Creates a corpus that encodes documents with the supplied `encoder`.
    ///
    /// `scalar` is not read by the initializer body.
    init(encoder: any SNLPEncoder, scalar: Scalar.Type = Double.self, typicalNeighborhoodSize: Int = 20) {
        self._documentEncoder = encoder
        self.encodedDocuments = DeterministicEphemeralVectorIndex<[Scalar]>(
            typicalNeighborhoodSize: typicalNeighborhoodSize
        )
    }

    // // Decodable conformance
    // required init(from decoder: Decoder) throws {
    // let container = try decoder.container(keyedBy: CodingKeys.self)
    // _documentEncoder = try container.decode(ContextFreeEncoder<Scalar>.self, forKey: ._documentEncoder)
    // encodedDocuments = try container.decode(DeterministicEphemeralVectorIndex<[Scalar]>.self, forKey: .encodedDocuments)
    // dictionary = try container.decode(HNSWDictionary.self, forKey: .dictionary)
    // }

    /// Encodes `document`, inserts the vector into the index, and records the document
    /// together with the vector the index holds under the returned key.
    @inlinable
    func addUntokenizedDocument(_ document: String) {
        /// The force cast to [Scalar] is needed when we use SNLPEncoder but not ContextFreeEncoder
        let sentenceVector = _documentEncoder.encodeSentence(document) as! [Scalar]
        /// encodedDocuments.insert will insert and return the corresponding key (id)
        let insertedKey = encodedDocuments.insert(sentenceVector)
        let storedVector = encodedDocuments.base.vectors[insertedKey]
        addDocumentVectorPair(at: insertedKey, document: document, vector: storedVector)
    }
}
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-02-26.
//
#if os(macOS)
import Foundation
import PriorityHeapModule
import PriorityHeapAlgorithms
import SimilarityMetric
import HNSWAlgorithm
import HNSWDurable
import CoreLMDB
import CoreLMDBCoders
// MARK: This uses the persistent DurableVectorIndex
extension DurableVectorIndex {
    /// Shorthand for the `NearbyVector` specialization built from this index's
    /// accessor compound key, metric vector, and metric similarity types.
    public typealias Neighbor = NearbyVector<Accessor.CompoundKey, Metric.Vector, Metric.Similarity>
}
/// A wrapper around `DurableVectorIndex` that uses a Cartesian distance metric and a
/// random number generator created from a fixed seed.
public struct DeterministicDurableVectorIndex<VectorComponent: UnsafeMemoryLayoutStorableFloat> where VectorComponent: Codable {
    public typealias Vector = [VectorComponent]
    public typealias Index = DurableVectorIndex<CartesianDistanceMetric<Vector>, Vector.Element>

    /// The wrapped index.
    public var base: Index

    /// The neighborhood size this index was configured with.
    public var typicalNeighborhoodSize: Int

    /// Number of vectors inserted through this value; also the next key to be assigned.
    public var size: Int = 0 // TODO: This size is not set when read from LMDB

    /// Generator handed to every insert; created with seed 1 in `init`.
    // NOTE(review): presumably fixed so that repeated runs build the same graph; confirm.
    private var rng: RandomNumberGenerator

    /// Opens (or creates) the underlying index for `namespace` within `transaction`.
    ///
    /// - Throws: Whatever the underlying `Index` initializer throws.
    public init(namespace: String, typicalNeighborhoodSize: Int = 20, in transaction: Transaction) throws {
        let metric = CartesianDistanceMetric<Vector>()
        let config = Config.unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)
        self.base = try Index(
            namespace: namespace,
            metric: metric,
            config: config,
            in: transaction
        )
        self.typicalNeighborhoodSize = typicalNeighborhoodSize
        self.rng = SeedableRNG(seed: 1)
    }

    /// Returns up to `limit` neighbors of `query` found by the index accessor.
    ///
    /// - Parameter exact: Exact search is not implemented; passing `true` terminates the process.
    /// - Throws: Errors from creating the accessor or from the search itself.
    public func find(near query: Vector, limit: Int, exact: Bool = false, in transaction: Transaction) throws -> [Index.Neighbor] {
        guard !exact else {
            // TODO: Exact search logic
            fatalError("Exact search logic for DeterministicDurableVectorIndex is not currently supported")
        }
        let accessor = try Index.Accessor(for: base, in: transaction)
        return Array(try accessor.find(near: query, limit: limit))
    }

    /// Inserts `vector` under the next key and returns that key.
    ///
    /// The key is the value of `size` before the insert; it is passed to the index as its
    /// decimal string. `size` is advanced only after the accessor has been created and the
    /// insert has run, so a thrown error neither skips a key nor inflates the count.
    ///
    /// - Throws: Errors from creating the accessor.
    @discardableResult
    public mutating func insert(_ vector: Vector, in transaction: Transaction) throws -> Int {
        let accessor = try Index.Accessor(for: base, in: transaction)
        let key = size
        accessor.insert(vector, forKey: String(key), using: &rng)
        size = key + 1
        return key
    }
}
#endif
//// Copyright (c) 2024 Jim Wallace
////
//// Permission is hereby granted, free of charge, to any person
//// obtaining a copy of this software and associated documentation
//// files (the "Software"), to deal in the Software without
//// restriction, including without limitation the rights to use,
//// copy, modify, merge, publish, distribute, sublicense, and/or sell
//// copies of the Software, and to permit persons to whom the
//// Software is furnished to do so, subject to the following
//// conditions:
////
//// The above copyright notice and this permission notice shall be
//// included in all copies or substantial portions of the Software.
////
//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
//// OTHER DEALINGS IN THE SOFTWARE.
////
//// Created by Mingchung Xia on 2024-02-07.
////
//
//import Foundation
//
//extension DeterministicEphemeralVectorIndex: Encodable where Vector: Encodable {
// enum CodingKeys: String, CodingKey {
// case typicalNeighborhoodSize
// case vectors
// }
//
// public func encode(to encoder: Encoder) throws {
// var container = encoder.container(keyedBy: CodingKeys.self)
// try container.encode(typicalNeighborhoodSize, forKey: .typicalNeighborhoodSize)
// try container.encode(base.vectors, forKey: .vectors)
// }
//}
//
//extension DeterministicEphemeralVectorIndex: Decodable where Vector: Decodable {
// public init(from decoder: Decoder) throws {
// let container = try decoder.container(keyedBy: CodingKeys.self)
// let typicalNeighborhoodSize = try container.decode(Int.self, forKey: .typicalNeighborhoodSize)
// let vectors = try container.decode([Vector].self, forKey: .vectors)
//
// self.init(typicalNeighborhoodSize: typicalNeighborhoodSize)
// for vector in vectors {
// self.insert(vector)
// }
// }
//}
//
//
...@@ -20,82 +20,61 @@ ...@@ -20,82 +20,61 @@
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING // WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE. // OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation import Foundation
import PriorityHeapModule import PriorityHeapModule
import PriorityHeapAlgorithms import PriorityHeapAlgorithms
import SimilarityMetric
import HNSWAlgorithm import HNSWAlgorithm
import HNSWEphemeral import HNSWEphemeral
// MARK: This uses the temporary EphemeralVectorIndex
class HNSWCorpus<Scalar: BinaryFloatingPoint & Codable>: SNLPCorpus { public struct DeterministicEphemeralVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint {
var _documentEncoder: ContextFreeEncoder<Scalar>
var zeroes: [Scalar]
var count: Int { 0 }
var encodedDocuments: [Int : [Scalar]] = [:] // TODO: This should be replaced by HNSW
init(_documentEncoder: ContextFreeEncoder<Scalar>) {
self._documentEncoder = _documentEncoder
zeroes = Array(repeating: Scalar(0), count: 384)
}
@inlinable public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<Vector>, Void>
func addUntokenizedDocument(_ document: String) {
fatalError("HNSWCorpus not implemented yet. Get on it.")
}
// var index = DeterministicSampleVectorIndex(typicalNeighborhoodSize: 20)
// for _ in 0..<100 {
// index.insertRandom(range: 0...1)
// }
//
// for i in 0..<10 {
// let sample = index.generateRandom(range: 0...1)
// print("iter \(i): \(sample)")
// let hnswResults = try! index.find(near: sample, limit: 10)
// let exactResult = try! index.find(near: sample, limit: 1, exact: true)
// XCTAssert(exactResult.contains(where: { $0.id == hnswResults[0].id }))
// }
}
public struct DeterministicSampleVectorIndex<Vector: Collection & Codable> where Vector.Element: BinaryFloatingPoint {
public typealias Index = EphemeralVectorIndex<Int, Int, CartesianDistanceMetric<[Double]>, Void>
public var base: Index public var base: Index
public var typicalNeighborhoodSize: Int
public init(typicalNeighborhoodSize: Int) { private var vectorRNG: RandomNumberGenerator
base = .init(metric: .init(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize)) private var graphRNG: RandomNumberGenerator
public init(typicalNeighborhoodSize: Int = 20) {
base = .init(metric: CartesianDistanceMetric<Vector>(), config: .unstableDefault(typicalNeighborhoodSize: typicalNeighborhoodSize))
self.typicalNeighborhoodSize = typicalNeighborhoodSize
self.vectorRNG = SeedableRNG(seed: 0)
self.graphRNG = SeedableRNG(seed: 1)
} }
public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] { public func find(near query: Vector, limit: Int, exact: Bool = false) throws -> [Index.Neighbor] {
if exact { if exact {
Array(PriorityHeap(base.vectors.enumerated().map { return Array(PriorityHeap(base.vectors.enumerated().map {
let similarity = base.metric.similarity(between: query as! [Double], $0.element) let similarity = base.metric.similarity(between: query, $0.element)
return NearbyVector(id: $0.offset, vector: $0.element, priority: similarity) return NearbyVector(id: $0.offset, vector: $0.element, priority: similarity)
}).descending().prefix(limit)) }).descending().prefix(limit))
} else { } else {
Array(try base.find(near: query as! [Double], limit: limit)) return Array(try base.find(near: query, limit: limit))
} }
} }
} public mutating func generateRandom(range: ClosedRange<Double>) -> CGPoint {
CGPoint(
public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint{ x: .random(in: range, using: &vectorRNG),
public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element { y: .random(in: range, using: &vectorRNG)
// Naïve cartesian distance )
let squaredSum = zip(someItem, otherItem) }
.map { (x, y) in (x - y) * (x - y) }
.reduce(0, +) @discardableResult
public mutating func insert(_ vector: Vector) -> Int {
return sqrt(squaredSum) let convertedVector: [Double] = vector.map{ Double($0) }
if let metricVector = convertedVector as? CartesianDistanceMetric<Vector>.Vector {
/// base.insert inserts the vector into the index and returns its key
let key = base.insert(metricVector, using: &graphRNG)
return key
} else {
fatalError("Unable to get metric vector")
}
} }
} }
//// Copyright (c) 2024 Jim Wallace
////
//// Permission is hereby granted, free of charge, to any person
//// obtaining a copy of this software and associated documentation
//// files (the "Software"), to deal in the Software without
//// restriction, including without limitation the rights to use,
//// copy, modify, merge, publish, distribute, sublicense, and/or sell
//// copies of the Software, and to permit persons to whom the
//// Software is furnished to do so, subject to the following
//// conditions:
////
//// The above copyright notice and this permission notice shall be
//// included in all copies or substantial portions of the Software.
////
//// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
//// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
//// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
//// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
//// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
//// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
//// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
//// OTHER DEALINGS IN THE SOFTWARE.
////
//// Created by Mingchung Xia on 2024-02-13.
////
//
//// MARK: This is outdated since we now have the presence of a DurableHNSWCorpus but still available for reference
//
//import Foundation
//
//final class HNSWCorpusDataHandler<Scalar: BinaryFloatingPoint & Codable> {
// var corpus: HNSWCorpus<Scalar>
// private var url: URL?
//
// init(corpus: HNSWCorpus<Scalar>, resource: String = "hnsw") {
// self.corpus = corpus
//// self.url = Bundle.module.url(forResource: resource, withExtension: "mmap")
// if let downloadsDirectory = FileManager.default.urls(for: .downloadsDirectory, in: .userDomainMask).first {
// self.url = downloadsDirectory.appendingPathComponent(resource + ".mmap")
// }
// }
//
// /// It is very difficult to get the exact size of the corpus as every class also depends on other classes
// /// The size of the memory map may not even be correct if it only stores the vectors, and the vectors are really the only "important" part
// func getCorpusSize() -> Int {
//// return heapSize(corpus)
//// return class_getInstanceSize(type(of: corpus))
//// return MemoryLayout.size(ofValue: corpus)
// var size = 0
// let data = corpus.encodedDocuments.base.vectors
// for vector in data {
// size += MemoryLayout.size(ofValue: vector)
// }
// return size
// }
//
// func getDictionarySize(includeKey: Bool = true) -> Int {
// var size = 0
// let data = corpus.getDictionary()
// for (key, documentVectorPair) in data {
// if includeKey { size += MemoryLayout.size(ofValue: key) }
// size += MemoryLayout.size(ofValue: documentVectorPair.untokenizedDocument)
// size += MemoryLayout.size(ofValue: documentVectorPair.vector)
// }
// return size
// }
//
// private func heapSize(_ obj: AnyObject) -> Int {
// return malloc_size(Unmanaged.passUnretained(obj).toOpaque())
// }
//}
//
//extension HNSWCorpusDataHandler {
// func saveMemoryMap() {
// guard let url = url else {
// print("URL to resource not found")
// return
// }
// let fileManager = FileManager.default
// if !fileManager.fileExists(atPath: url.path) {
// fileManager.createFile(atPath: url.path, contents: nil, attributes: nil)
// }
// do {
//// let fileHandle = try FileHandle(forWritingTo: url)
////
//// let count = corpus.count
//// let countData = withUnsafeBytes(of: count) { Data($0) }
//// fileHandle.write(countData)
////
//// for pair in corpus {
//// let documentData = pair.untokenizedDocument.utf8CString.withUnsafeBufferPointer { Data(buffer: $0) }
//// fileHandle.write(documentData)
//// }
//// fileHandle.closeFile()
//
// print("Saving HNSW to file...")
// /// Using the Codable conformances
// let encoder = JSONEncoder()
// let encoded = try encoder.encode(corpus)
// try encoded.write(to: url)
// } catch {
// print("Error writing HNSW to file: \(error)")
// }
// }
//
// /// This saves only the untokenized documents dictionary map
// func saveDictionaryMemoryMap() {
// // TODO: Move from DurableHNSW extension once HNSW wrapper is created
// }
//
// // TODO: find out how to not rebuild the index
// static func loadMemoryMap(encoder: any SNLPEncoder, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
// guard let url = Bundle.module.url(forResource: resource, withExtension: "mmap") else {
// print("URL to resource not found")
// return HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
// }
//
// var loadedCorpus = HNSWCorpus(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize)
//
// do {
//// let data = try Data(contentsOf: url, options: .alwaysMapped)
//// let countData = data.prefix(MemoryLayout<Int>.size)
//// let count: Int = countData.withUnsafeBytes { $0.load(as: Int.self) }
//// var index = MemoryLayout<Int>.size
////
//// for _ in 0..<count {
//// if let stringRange = data[index...].range(of: "\0".data(using: .utf8)!) {
//// let documentData = data[index..<stringRange.lowerBound]
//// if let document = String(data: documentData, encoding: .utf8) {
//// // Add the untokenized document to the corpus
//// loadedCorpus.addUntokenizedDocument(document)
//// index = stringRange.upperBound
//// }
//// } else {
//// break
//// }
//// }
//
// /// Using the Codable conformances
// print("Loading HNSW from file...")
// let decoder = JSONDecoder()
// let data = try Data(contentsOf: url)
// loadedCorpus = try decoder.decode(HNSWCorpus<Double>.self, from: data)
// } catch {
// print("Error reading HNSW from file: \(error)")
// }
// return loadedCorpus
// }
//
// static func loadMemoryMap(encoding: ContextFreeEncoder<Scalar>.PreComputedEmbeddings, typicalNeighborhoodSize: Int = 20, resource: String = "hnsw") -> HNSWCorpus<Double> {
// let encoder = ContextFreeEncoder<Scalar>(source: encoding)
// return loadMemoryMap(encoder: encoder, typicalNeighborhoodSize: typicalNeighborhoodSize, resource: resource)
// }
//}
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-01-28.
//
import Foundation
import SimilarityMetric
#if canImport(Surge) && canImport(Accelerate) && os(macOS)
import Surge
import Accelerate
/// Similarity metric that scores two vectors by their squared Euclidean distance,
/// computed with Surge.
///
/// Note that the value is the *squared* distance (`Surge.distSq`); no square root is taken.
public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the squared Euclidean distance between `someItem` and `otherItem`.
    ///
    /// - Parameters:
    ///   - someItem: First vector.
    ///   - otherItem: Second vector.
    /// - Returns: The squared distance, converted to `Vector.Element`.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        // Fast path: the vectors already are `[Double]`, so no conversion is needed.
        if let lhs = someItem as? [Double], let rhs = otherItem as? [Double] {
            return Vector.Element(Surge.distSq(lhs, rhs))
        }
        // Any other floating-point collection is converted element-wise instead of
        // being force-cast, which would crash at runtime.
        let lhs = someItem.map { Double($0) }
        let rhs = otherItem.map { Double($0) }
        return Vector.Element(Surge.distSq(lhs, rhs))
    }
}
#else
//import Nifty
// MARK: Nifty is too outdated to retrofit our code, even after updating its swift-tools-version to 5.9.
/// Portable similarity metric that scores two vectors by their squared Euclidean distance.
///
/// This implementation may be less efficient on Linux.
public struct CartesianDistanceMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the sum of squared element-wise differences between the two vectors.
    ///
    /// Elements are paired with `zip`, so any surplus elements of the longer vector are ignored.
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        return zip(someItem, otherItem).reduce(into: Vector.Element.zero) { total, pair in
            let delta = pair.0 - pair.1
            total += delta * delta
        }
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// Created by Mingchung Xia on 2024-03-14.
//
import Foundation
import SimilarityMetric
#if canImport(Surge) && canImport(Accelerate) && os(macOS)
import Surge
import Accelerate
/// Similarity metric that scores two vectors by the cosine of the angle between them,
/// computed with Surge.
public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the cosine similarity of `someItem` and `otherItem`.
    ///
    /// - Parameters:
    ///   - someItem: First vector.
    ///   - otherItem: Second vector.
    /// - Returns: `dot(a, b) / (|a| * |b|)` converted to `Vector.Element`, or `0` when
    ///   either vector has zero magnitude (the quotient would otherwise be NaN).
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        // Convert vectors to arrays of Double
        let someItemDoubles = someItem.map { Double($0) }
        let otherItemDoubles = otherItem.map { Double($0) }

        // Calculate dot product using Surge for cosine similarity numerator
        let dotProduct = Surge.dot(someItemDoubles, otherItemDoubles)

        // Manually calculate magnitudes (norms) of the vectors for the denominator
        let someItemMagnitude = sqrt(Surge.dot(someItemDoubles, someItemDoubles))
        let otherItemMagnitude = sqrt(Surge.dot(otherItemDoubles, otherItemDoubles))
        let denominator = someItemMagnitude * otherItemMagnitude

        // A zero-magnitude vector has no direction; report "no similarity" rather than
        // dividing by zero. `!= 0` (not `> 0`) lets NaN inputs keep propagating as NaN.
        guard denominator != 0 else { return 0 }

        // Calculate cosine similarity and convert back to type Vector.Element
        return Vector.Element(dotProduct / denominator)
    }
}
#else
//import Nifty
// MARK: Nifty is too outdated to retrofit our code, even after updating its swift-tools-version to 5.9.
/// Portable similarity metric that scores two vectors by the cosine of the angle between them.
///
/// This implementation may be less efficient on Linux.
public struct CosineSimilarityMetric<Vector: Collection & Codable>: SimilarityMetric where Vector.Element: BinaryFloatingPoint {
    /// Returns the cosine similarity of `someItem` and `otherItem`.
    ///
    /// - Parameters:
    ///   - someItem: First vector.
    ///   - otherItem: Second vector.
    /// - Returns: `dot(a, b) / (|a| * |b|)`, or `0` when either vector has zero magnitude
    ///   (the quotient would otherwise be NaN).
    public func similarity(between someItem: Vector, _ otherItem: Vector) -> Vector.Element {
        let dotProduct = zip(someItem, otherItem).reduce(Vector.Element.zero) { $0 + $1.0 * $1.1 }
        let magnitudeSomeItem = someItem.reduce(Vector.Element.zero) { $0 + $1 * $1 }.squareRoot()
        let magnitudeOtherItem = otherItem.reduce(Vector.Element.zero) { $0 + $1 * $1 }.squareRoot()
        let denominator = magnitudeSomeItem * magnitudeOtherItem

        // A zero-magnitude vector has no direction; report "no similarity" rather than
        // dividing by zero. `!= 0` (not `> 0`) lets NaN inputs keep propagating as NaN.
        guard denominator != 0 else { return 0 }

        return dotProduct / denominator
    }
}
#endif
// Copyright (c) 2024 Jim Wallace
//
// Permission is hereby granted, free of charge, to any person
// obtaining a copy of this software and associated documentation
// files (the "Software"), to deal in the Software without
// restriction, including without limitation the rights to use,
// copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the
// Software is furnished to do so, subject to the following
// conditions:
//
// The above copyright notice and this permission notice shall be
// included in all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
// OTHER DEALINGS IN THE SOFTWARE.
//
// See the https://github.com/JadenGeller/similarity-topology.git
// for reference. The code is used with permission from the author
// under the MIT License.
//
// Created by Mingchung Xia on 2024-01-28.
//
#if canImport(GameplayKit) && os(macOS)
import Foundation
import GameplayKit
// MARK: GameplayKit provides a mersenne twister for RNG, but is not available on Linux
// See https://github.com/quells/Squall package for alternative mersenne twister
/// A seedable `RandomNumberGenerator` that draws its bits from GameplayKit's
/// `GKMersenneTwisterRandomSource`.
@available(macOS, introduced: 10.11)
struct MersenneTwisterRNG: RandomNumberGenerator {
    private let randomSource: GKMersenneTwisterRandomSource

    /// Creates a generator whose underlying random source is initialised with `seed`.
    init(seed: UInt64) {
        self.randomSource = GKMersenneTwisterRandomSource(seed: seed)
    }

    /// Produces a 64-bit value from two consecutive draws of the random source:
    /// the first draw fills the upper 32 bits, the second the lower 32 bits.
    mutating func next() -> UInt64 {
        let high = nextWord()
        let low = nextWord()
        return (UInt64(high) << 32) | UInt64(low)
    }

    /// Takes one `nextInt()` draw and reinterprets its 32-bit pattern as unsigned.
    private func nextWord() -> UInt32 {
        let signed = Int32(randomSource.nextInt())
        return UInt32(bitPattern: signed)
    }
}
#endif
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment