Skip to content
Snippets Groups Projects
Commit 44048b9f authored by Jim Wallace's avatar Jim Wallace
Browse files

Test compile new zstd library

parent f7ae9342
No related branches found
No related tags found
No related merge requests found
Pipeline #108426 passed with warnings
{ {
"pins" : [ "pins" : [
{
"identity" : "elva",
"kind" : "remoteSourceControl",
"location" : "https://github.com/L1MeN9Yu/Elva",
"state" : {
"revision" : "9cd193dd5f0df430726256ab5d6aa08455f5d39b",
"version" : "2.1.3"
}
},
{ {
"identity" : "kdtree", "identity" : "kdtree",
"kind" : "remoteSourceControl", "kind" : "remoteSourceControl",
...@@ -44,6 +35,15 @@ ...@@ -44,6 +35,15 @@
"revision" : "8eeea249ead49f6f8db8e76bf66abce49cc5b856", "revision" : "8eeea249ead49f6f8db8e76bf66abce49cc5b856",
"version" : "1.0.0" "version" : "1.0.0"
} }
},
{
"identity" : "zstd.swift",
"kind" : "remoteSourceControl",
"location" : "https://github.com/awxkee/zstd.swift.git",
"state" : {
"revision" : "895dac6adf40f4adf6cd889dbff31958e6fa1203",
"version" : "1.0.1"
}
} }
], ],
"version" : 2 "version" : 2
......
...@@ -16,9 +16,11 @@ let package = Package( ...@@ -16,9 +16,11 @@ let package = Package(
dependencies: [ dependencies: [
.package(url: "https://github.com/Jounce/Surge.git", .upToNextMajor(from: "2.3.2")), .package(url: "https://github.com/Jounce/Surge.git", .upToNextMajor(from: "2.3.2")),
.package(url: "https://github.com/jbadger3/SwiftAnnoy", .upToNextMajor(from: "1.0.0")), .package(url: "https://github.com/jbadger3/SwiftAnnoy", .upToNextMajor(from: "1.0.0")),
.package(url: "https://github.com/L1MeN9Yu/Elva", .upToNextMajor(from: "2.0.0")), //.package(url: "https://github.com/L1MeN9Yu/Elva", .upToNextMajor(from: "2.0.0")),
.package(url: "https://github.com/Bersaelor/KDTree", .upToNextMajor(from: "1.4.1")), .package(url: "https://github.com/Bersaelor/KDTree", .upToNextMajor(from: "1.4.1")),
.package(url: "https://github.com/apple/swift-collections", .upToNextMinor(from: "1.0.0")), .package(url: "https://github.com/apple/swift-collections", .upToNextMinor(from: "1.0.0")),
.package(url: "https://github.com/awxkee/zstd.swift.git", from: "1.0.1"),
//.package(url: "https://github.com/facebook/zstd", .upToNextMajor(from: "1.5.5")),
], ],
targets: [ targets: [
.target( .target(
...@@ -26,7 +28,9 @@ let package = Package( ...@@ -26,7 +28,9 @@ let package = Package(
dependencies: [ dependencies: [
"Surge", "Surge",
"SwiftAnnoy", "SwiftAnnoy",
.product(name: "ZSTD", package: "Elva"), //"zstd",
//"libzstd",
.product(name: "zstd", package: "zstd.swift"),
"KDTree", "KDTree",
.product(name: "Collections", package: "swift-collections"), .product(name: "Collections", package: "swift-collections"),
], ],
......
////
//// File.swift
////
////
//// Created by Jim Wallace on 2023-08-31.
////
// //
// File.swift //// Download from: https://reddit-top20k.cworld.ai/
// //// e.g. https://reddit-archive.cworld.ai/AskReddit_submissions.zst
//// e.g. https://reddit-archive.cworld.ai/AskReddit_comments.zst
// //
// Created by Jim Wallace on 2023-08-31.
//
// Download from: https://reddit-top20k.cworld.ai/
// e.g. https://reddit-archive.cworld.ai/AskReddit_submissions.zst
// e.g. https://reddit-archive.cworld.ai/AskReddit_comments.zst
import Foundation import Foundation
import ZSTD import zstd
//import ZSTD
public typealias SubredditData = [String : RedditThread] //import zlib
public typealias SubredditID = String //
//public typealias SubredditData = [String : RedditThread]
//public typealias SubredditID = String
// Load data from all files, async //
public func downloadSubredditFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> SubredditData { //
//// Load data from all files, async
// TODO: Add another server? e.g., https://the-eye.eu/redarcs/files/ //public func downloadSubredditFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> SubredditData {
//
var result: SubredditData = [String : RedditThread]() // // TODO: Add another server? e.g., https://the-eye.eu/redarcs/files/
//
let submissionsURL = source + subreddit + "_submissions.zst" // var result: SubredditData = [String : RedditThread]()
let commentsURL = source + subreddit + "_comments.zst" //
// let submissionsURL = source + subreddit + "_submissions.zst"
debugPrint("Downloading \(submissionsURL)") // let commentsURL = source + subreddit + "_comments.zst"
debugPrint("Downloading \(commentsURL)") //
// debugPrint("Downloading \(submissionsURL)")
if let submissionsURL = URL(string: submissionsURL), // debugPrint("Downloading \(commentsURL)")
let commentsURL = URL(string: commentsURL) { //
do { // if let submissionsURL = URL(string: submissionsURL),
// let commentsURL = URL(string: commentsURL) {
// Download and process submissions and comments // do {
async let submissionsData = try Data(contentsOf: submissionsURL) //
async let commentsData = try Data(contentsOf: commentsURL) // // Download and process submissions and comments
// async let submissionsData = try Data(contentsOf: submissionsURL)
// async let commentsData = try Data(contentsOf: commentsURL)
// Once we have submissions data, //
let _ = try await submissionsData //
debugPrint("Processing submission data...") // // Once we have submissions data,
let (submissions, _ ): ([Submission],[Data]) = try await processRedditDataFile(submissionsData, verbose: verbose) // TODO: Figure out what to do with error data // let _ = try await submissionsData
for submission in submissions { // debugPrint("Processing submission data...")
// Create a new thread for each submission, index by submission ID // let (submissions, _ ): ([Submission],[Data]) = try await processRedditDataFile(submissionsData, verbose: verbose) // TODO: Figure out what to do with error data
result[submission.id!] = RedditThread(submission: submission, comments: [Comment]()) // for submission in submissions {
} // // Create a new thread for each submission, index by submission ID
debugPrint("Completed processing submissions.") // result[submission.id!] = RedditThread(submission: submission, comments: [Comment]())
// }
// Then fill in the comments once we have them.. // debugPrint("Completed processing submissions.")
let _ = try await commentsData //
debugPrint("Processing comments data...") // // Then fill in the comments once we have them..
let (comments, _ ): ([Comment],[Data]) = try await processRedditDataFile(commentsData, verbose: verbose) // TODO: Figure out what to do with error data // let _ = try await commentsData
for comment in comments { // debugPrint("Processing comments data...")
if var thread = result[comment.link_id!] { // let (comments, _ ): ([Comment],[Data]) = try await processRedditDataFile(commentsData, verbose: verbose) // TODO: Figure out what to do with error data
thread.add(comment) // for comment in comments {
} // if var thread = result[comment.link_id!] {
} // thread.add(comment)
debugPrint("Completed processing comments.") // }
// }
} catch { // debugPrint("Completed processing comments.")
print("Error downloading or loading data: \(error)") //
return result // } catch {
} // print("Error downloading or loading data: \(error)")
} // return result
// }
return result // }
} //
// return result
//}
// Load submission data //
public func downloadSubmissionsFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> [Submission] { //
//// Load submission data
var result = [Submission]() //public func downloadSubmissionsFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> [Submission] {
//
let submissionsURL = source + subreddit + "_submissions.zst" // var result = [Submission]()
debugPrint("Downloading \(submissionsURL)") //
// let submissionsURL = source + subreddit + "_submissions.zst"
if let submissionsURL = URL(string: submissionsURL) { // debugPrint("Downloading \(submissionsURL)")
do { //
// if let submissionsURL = URL(string: submissionsURL) {
// Download and process submissions and comments // do {
async let submissionsData = try Data(contentsOf: submissionsURL) //
// // Download and process submissions and comments
// Once we have submissions data, // async let submissionsData = try Data(contentsOf: submissionsURL)
let _ = try await submissionsData //
debugPrint("Processing submission data...") // // Once we have submissions data,
let (submissions, _ ): ([Submission],[Data]) = try await processRedditDataFile(submissionsData, verbose: verbose) // TODO: Figure out what to do with error data // let _ = try await submissionsData
debugPrint("Completed processing submissions.") // debugPrint("Processing submission data...")
// let (submissions, _ ): ([Submission],[Data]) = try await processRedditDataFile(submissionsData, verbose: verbose) // TODO: Figure out what to do with error data
result = submissions // debugPrint("Completed processing submissions.")
//
} catch { // result = submissions
print("Error downloading or loading data: \(error)") //
return result // } catch {
} // print("Error downloading or loading data: \(error)")
} // return result
return result // }
} // }
// return result
// Load comments data //}
public func downloadCommentsFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> [Comment] { //
//// Load comments data
var result = [Comment]() //public func downloadCommentsFromServer(subreddit: String, source: String = "https://reddit-archive.cworld.ai/", verbose: Bool = false) async throws -> [Comment] {
//
let commentsURL = source + subreddit + "_comments.zst" // var result = [Comment]()
debugPrint("Downloading \(commentsURL)") //
// let commentsURL = source + subreddit + "_comments.zst"
if let commentsURL = URL(string: commentsURL) { // debugPrint("Downloading \(commentsURL)")
do { //
// if let commentsURL = URL(string: commentsURL) {
// Download and process submissions and comments // do {
async let commentsData = try Data(contentsOf: commentsURL) //
// // Download and process submissions and comments
// Once we have submissions data, // async let commentsData = try Data(contentsOf: commentsURL)
let _ = try await commentsData //
debugPrint("Processing submission data...") // // Once we have submissions data,
let (comments, _ ): ([Comment],[Data]) = try await processRedditDataFile(commentsData, verbose: verbose) // TODO: Figure out what to do with error data // let _ = try await commentsData
debugPrint("Completed processing comments.") // debugPrint("Processing submission data...")
// let (comments, _ ): ([Comment],[Data]) = try await processRedditDataFile(commentsData, verbose: verbose) // TODO: Figure out what to do with error data
result = comments // debugPrint("Completed processing comments.")
//
} catch { // result = comments
print("Error downloading or loading data: \(error)") //
return result // } catch {
} // print("Error downloading or loading data: \(error)")
} // return result
return result // }
} // }
// return result
//}
// A parallelizable version of the file processing function //
func processRedditDataFile<C: Decodable>( _ fileData: Data, verbose: Bool = false) async throws -> (posts: [C], errors: [Data]) { //
//// A parallelizable version of the file processing function
var posts = [C]() //func processRedditDataFile<C: Decodable>( _ fileData: Data, verbose: Bool = false) async throws -> (posts: [C], errors: [Data]) {
var errorData = [Data]() //
// var posts = [C]()
//let fileData = try Data(contentsOf: fileURL) // var errorData = [Data]()
var splitData: [Data] //
// //let fileData = try Data(contentsOf: fileURL)
let decoder = JSONDecoder() // var splitData: [Data]
//
// let decoder = JSONDecoder()
//
if verbose { debugPrint("Decompressing file...") } //
let inputMemory = BufferedMemoryStream(startData: fileData) //
let decompressMemory = BufferedMemoryStream() // if verbose { debugPrint("Decompressing file...") }
// let inputMemory = BufferedMemoryStream(startData: fileData)
try ZSTD.decompress(reader: inputMemory, writer: decompressMemory, config: ZSTD.DecompressConfig.default) // let decompressMemory = BufferedMemoryStream()
let decompressedData = decompressMemory.representation //
if verbose { debugPrint("... decompressed.") } // try ZSTD.decompress(reader: inputMemory, writer: decompressMemory, config: ZSTD.DecompressConfig.default)
// let decompressedData = decompressMemory.representation
splitData = splitDataIntoLines(data: decompressedData) // if verbose { debugPrint("... decompressed.") }
//
// Error logging variables // splitData = splitDataIntoLines(data: decompressedData)
var lastData: Data? = nil //
// // Error logging variables
for data in splitData { // var lastData: Data? = nil
do { //
// Reset our error tracking variables // for data in splitData {
lastData = data // do {
posts.append( try decoder.decode(C.self, from: data) ) // // Reset our error tracking variables
// lastData = data
} catch { // posts.append( try decoder.decode(C.self, from: data) )
//numberOfErrors += 1 //
errorData.append(lastData!) // } catch {
} // //numberOfErrors += 1
} // errorData.append(lastData!)
// }
return (posts, errorData) // }
} //
// return (posts, errorData)
//}
//
func splitDataIntoLines(data: Data) -> [Data] { //
var lines = [Data]() //
var lineStart = data.startIndex //func splitDataIntoLines(data: Data) -> [Data] {
var lineEnd: Data.Index? // var lines = [Data]()
var current = data.startIndex // var lineStart = data.startIndex
while current < data.endIndex { // var lineEnd: Data.Index?
if data[current] == 10 { // ASCII newline // var current = data.startIndex
lineEnd = current // while current < data.endIndex {
var line = data[lineStart..<lineEnd!] // if data[current] == 10 { // ASCII newline
if line.last == 44 { line.removeLast() } // Remove trailing commas // lineEnd = current
lines.append(line) // var line = data[lineStart..<lineEnd!]
lineStart = data.index(after: current) // if line.last == 44 { line.removeLast() } // Remove trailing commas
} // lines.append(line)
current = data.index(after: current) // lineStart = data.index(after: current)
} // }
if lineStart < current { // current = data.index(after: current)
var line = data[lineStart..<current] // }
// if lineStart < current {
if line.last == 44 { line.removeLast() } // Remove trailing commas // var line = data[lineStart..<current]
//
lines.append(line) // if line.last == 44 { line.removeLast() } // Remove trailing commas
} //
return lines // lines.append(line)
} // }
// return lines
//}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment