diff --git a/Package.resolved b/Package.resolved index bcefeeef49d54251591a011e0fd0daeece9acc71..6f78ef2766d1092e033b37f88fd34df1b29c8d42 100644 --- a/Package.resolved +++ b/Package.resolved @@ -23,8 +23,8 @@ "kind" : "remoteSourceControl", "location" : "https://github.com/L1MeN9Yu/Elva", "state" : { - "revision" : "9cd193dd5f0df430726256ab5d6aa08455f5d39b", - "version" : "2.1.3" + "revision" : "c95092d9b5c7f39ea0aaf391a8df3cced4168b73", + "version" : "2.2.0" } }, { diff --git a/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapFileReadStream.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapFileReadStream.swift new file mode 100644 index 0000000000000000000000000000000000000000..0639a065f1a8eecfbea2f1f2bb3e33327ac3b1c9 --- /dev/null +++ b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapFileReadStream.swift @@ -0,0 +1,53 @@ +// Copyright (c) 2024 Jim Wallace +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +import Foundation +import ZSTD + +/** + A memory mapped version of the FileReadStream -- not clear to me whether this will be useful, just for experimenting + */ +public class MMapFileReadStream { + + private let data: Data + private var currentIndex = 0 + + public init(path: String) throws { + let url = URL(fileURLWithPath: path) + self.data = try Data(contentsOf: url, options: .alwaysMapped) + } +} + +extension MMapFileReadStream: ReadableStream { + public func read(_ buffer: UnsafeMutablePointer<UInt8>, length: Int) -> Int { + let bytesToRead = min(length, data.count - currentIndex) + data.copyBytes(to: buffer, from: currentIndex..<currentIndex + bytesToRead) + currentIndex += bytesToRead + return bytesToRead + } +} + +extension MMapFileReadStream: ByteStream { + public func close() { + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapFileWriteStream.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapFileWriteStream.swift new file mode 100644 index 0000000000000000000000000000000000000000..99e829d4145e37ed7540c82dbeeeb708bff6ad3a --- /dev/null +++ b/Sources/SwiftNLP/1. 
Data Collection/PushShift Archives/MMapFileWriteStream.swift @@ -0,0 +1,46 @@ +// Copyright (c) 2024 Jim Wallace +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +import Foundation +import ZSTD + +public class MMapFileWriteStream { + private var data: Data + + public init(path: String) throws { + let url = URL(fileURLWithPath: path) + self.data = try Data(contentsOf: url, options: .alwaysMapped) + } +} + +public extension MMapFileWriteStream { + func close() { + } +} + +extension MMapFileWriteStream: WriteableStream { + public func write(_ data: UnsafePointer<UInt8>, length: Int) -> Int { + self.data.append(data, count: length) + return length + } +} diff --git a/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapStreamingRedditArchiveDecoder.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapStreamingRedditArchiveDecoder.swift new file mode 100644 index 0000000000000000000000000000000000000000..d2b3d3e13855f342d9c2ead467b38066a08a3c85 --- /dev/null +++ b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/MMapStreamingRedditArchiveDecoder.swift @@ -0,0 +1,155 @@ +// Copyright (c) 2024 Jim Wallace +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+
+import Foundation
+import ZSTD
+
+
+/**
+ Implements a BufferedMemoryStream that can be used to pull out JSON objects on the fly as part of an Elva streaming decompression chain
+ */
+public class MMapStreamingRedditArchiveDecoder<T: Decodable> {
+
+    // Same functionality as Elva's BufferedMemoryStream
+    public private(set) var representation: Data = Data()
+    private var readerIndex: Int = 0
+
+    // Our variables:
+    private var filter: (T) -> Bool                   /// Determines whether we keep each element; defaults to keeping everything
+    public var memoryStore: [T]? = nil                /// Where to put output if we want to keep it in memory
+    private var outputStream: WriteableStream? = nil  /// Where to write output if we just want to keep streaming it, e.g. a `FileWriteStream`
+    public var errorStore: [Data] = [Data]()          /// Save any chunks that fail to decode so that we can fix decoding ... remove this later?
+
+    // We'll use this to pull out objects
+    private let decoder = JSONDecoder()
+
+
+    /**
+     Initializer that immediately decompresses the stream
+     */
+    public init(from: ReadableStream,
+                to: WriteableStream,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default)
+    throws {
+        outputStream = to
+        self.filter = filter
+        try ZSTD.decompress(reader: from, writer: self, config: config)
+    }
+
+    /**
+     Initializer that immediately decompresses the stream and stores its items in an array
+     */
+    public init(from: ReadableStream,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default
+    ) throws {
+        memoryStore = [T]()
+        self.filter = filter
+        try ZSTD.decompress(reader: from, writer: self, config: config)
+    }
+
+    /**
+     Initializer that immediately decompresses the file and stores its items in an array
+     */
+    public init(path: String,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default
+    ) throws {
+
+        let i = try? MMapFileReadStream(path: path)
+
+        guard let i = i else {
+            throw NSError(domain: "com.example.generic", code: 0, userInfo: [NSLocalizedDescriptionKey: "A generic error occurred."]) // TODO: Clean this up
+        }
+
+        memoryStore = [T]()
+        self.filter = filter
+        try ZSTD.decompress(reader: i, writer: self, config: config)
+    }
+
+    /**
+     Initializer that immediately decompresses one file and streams the filtered output to another file
+     */
+    public init(inputPath: String,
+                outputPath: String,
+                filter: @escaping (T) -> Bool = { _ in true }
+    ) throws {
+
+        let i = try? MMapFileReadStream(path: inputPath)
+        let o = try? MMapFileWriteStream(path: outputPath)
+
+        guard let i = i, let o = o else {
+            throw NSError(domain: "com.example.generic", code: 0, userInfo: [NSLocalizedDescriptionKey: "A generic error occurred."]) // TODO: Clean this up
+        }
+
+        self.outputStream = o
+        self.filter = filter
+
+        let config = ZSTD.DecompressConfig(parameters: [.windowLogMax(31)])
+
+        try ZSTD.decompress(reader: i, writer: self, config: config)
+    }
+
+}
+
+
+extension MMapStreamingRedditArchiveDecoder: WriteableStream {
+    public func write(_ data: UnsafePointer<UInt8>, length: Int) -> Int {
+        representation += Data(bytes: data, count: length)
+
+        // While we have newlines, work through the file
+        while let range = representation.range(of: Data([0x0A])) {
+            let chunk = representation.subdata(in: 0..<range.lowerBound + 1)
+            representation.removeSubrange(0..<range.lowerBound + 1)
+
+            let item = try?
decoder.decode(T.self, from: chunk) + + // If we found an error, save the data + if item == nil { + errorStore.insert(chunk, at: errorStore.count) + } + + if let item = item, filter(item) { + if memoryStore != nil { + memoryStore!.insert(item, at: memoryStore!.count) + } + if let output = outputStream { + let size = chunk.count + let buffer = UnsafeMutablePointer<UInt8>.allocate(capacity: size) + chunk.copyBytes(to: buffer, count: size) + let _ = output.write(buffer, length: size) + } + } + } + + return length + } + + public func close() { + if let out = outputStream { + out.close() + } + } + +} diff --git a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditCommentData.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditCommentData.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditCommentData.swift rename to Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditCommentData.swift diff --git a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditContainer.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditContainer.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditContainer.swift rename to Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditContainer.swift diff --git a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditSubmissionData.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditSubmissionData.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/Pushshift Archives/RedditSubmissionData.swift rename to Sources/SwiftNLP/1. Data Collection/PushShift Archives/RedditSubmissionData.swift diff --git a/Sources/SwiftNLP/1. Data Collection/PushShift Archives/StreamingRedditArchiveDecoder.swift b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/StreamingRedditArchiveDecoder.swift new file mode 100644 index 0000000000000000000000000000000000000000..88f74fb547d7afb0484286a27af6181c429f4ad2 --- /dev/null +++ b/Sources/SwiftNLP/1. Data Collection/PushShift Archives/StreamingRedditArchiveDecoder.swift @@ -0,0 +1,132 @@ +// Copyright (c) 2024 Jim Wallace +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. 
+
+import Foundation
+import ZSTD
+
+
+/**
+ Implements a BufferedMemoryStream that can be used to pull out JSON objects on the fly as part of an Elva streaming decompression chain
+ */
+public class StreamingRedditArchiveDecoder<T: Decodable> {
+
+    // Same functionality as Elva's BufferedMemoryStream
+    public private(set) var representation: Data = Data()
+    private var readerIndex: Int = 0
+
+    // Our variables:
+    private var filter: (T) -> Bool                   /// Determines whether we keep each element; defaults to keeping everything
+    public var memoryStore: [T]? = nil                /// Where to put output if we want to keep it in memory
+    private var outputStream: WriteableStream? = nil  /// Where to write output if we just want to keep streaming it, e.g. a `FileWriteStream`
+    public var errorStore: [Data] = [Data]()          /// Save any chunks that fail to decode so that we can fix decoding ... remove this later?
+
+    // We'll use this to pull out objects
+    private let decoder = JSONDecoder()
+
+
+    /**
+     Initializer that immediately decompresses the stream
+     */
+    public init(from: ReadableStream,
+                to: WriteableStream,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default)
+    throws {
+        outputStream = to
+        self.filter = filter
+        try ZSTD.decompress(reader: from, writer: self, config: config)
+    }
+
+    /**
+     Initializer that immediately decompresses the stream and stores its items in an array
+     */
+    public init(from: ReadableStream,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default
+    ) throws {
+        memoryStore = [T]()
+        self.filter = filter
+        try ZSTD.decompress(reader: from, writer: self, config: config)
+    }
+
+    /**
+     Initializer that immediately decompresses the file and stores its items in an array
+     */
+    public init(path: String,
+                filter: @escaping (T) -> Bool = { _ in true },
+                config: ZSTD.DecompressConfig = ZSTD.DecompressConfig.default
+    ) throws {
+
+        let i = try? FileReadStream(path: path)
+
+        guard let i = i else {
+            throw NSError(domain: "com.example.generic", code: 0, userInfo: [NSLocalizedDescriptionKey: "A generic error occurred."]) // TODO: Clean this up
+        }
+
+        memoryStore = [T]()
+        self.filter = filter
+        try ZSTD.decompress(reader: i, writer: self, config: config)
+    }
+
+}
+
+
+extension StreamingRedditArchiveDecoder: WriteableStream {
+    public func write(_ data: UnsafePointer<UInt8>, length: Int) -> Int {
+        representation += Data(bytes: data, count: length)
+
+        // While we have newlines, work through the file
+        while let range = representation.range(of: Data([0x0A])) {
+            let chunk = representation.subdata(in: 0..<range.lowerBound + 1)
+            representation.removeSubrange(0..<range.lowerBound + 1)
+
+            let item = try? decoder.decode(T.self, from: chunk)
+
+            // If we found an error, save the data
+            if item == nil {
+                errorStore.insert(chunk, at: errorStore.count)
+            }
+
+            if let item = item, filter(item) {
+                if memoryStore != nil {
+                    memoryStore!.insert(item, at: memoryStore!.count)
+                }
+                if let output = outputStream {
+                    let size = chunk.count
+                    let buffer = UnsafeMutablePointer<UInt8>.allocate(capacity: size)
+                    chunk.copyBytes(to: buffer, count: size)
+                    let _ = output.write(buffer, length: size)
+                }
+            }
+        }
+
+        return length
+    }
+
+    public func close() {
+        if let out = outputStream {
+            out.close()
+        }
+    }
+
+}
diff --git a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/loadFromRedditArchive.swift b/Sources/SwiftNLP/1. 
Data Collection/PushShift Archives/loadFromRedditArchive.swift similarity index 100% rename from Sources/SwiftNLP/1. Data Collection/Pushshift Archives/loadFromRedditArchive.swift rename to Sources/SwiftNLP/1. Data Collection/PushShift Archives/loadFromRedditArchive.swift diff --git a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/BufferedRedditArchiveDecoder.swift b/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/BufferedRedditArchiveDecoder.swift deleted file mode 100644 index 41428c1d19f6ecf1d5e1723305cc8bfb8c2f6607..0000000000000000000000000000000000000000 --- a/Sources/SwiftNLP/1. Data Collection/Pushshift Archives/BufferedRedditArchiveDecoder.swift +++ /dev/null @@ -1,109 +0,0 @@ -// Copyright (c) 2024 Jim Wallace -// -// Permission is hereby granted, free of charge, to any person -// obtaining a copy of this software and associated documentation -// files (the "Software"), to deal in the Software without -// restriction, including without limitation the rights to use, -// copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the -// Software is furnished to do so, subject to the following -// conditions: -// -// The above copyright notice and this permission notice shall be -// included in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES -// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT -// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, -// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -// OTHER DEALINGS IN THE SOFTWARE. - -import Foundation -import ZSTD - - -/** - Implements a BufferedMemoryStream that can be used to pull out JSON objects on the fly as part of an Elva streaming decompression chain - */ -public class BufferedRedditArchiveDecoder<T: Decodable> { - - // Same functionality as Elva's BufferedMemoryStream - public private(set) var representation: Data = Data() - private var readerIndex: Int = 0 - - // Our variables: - private var filter: (T) -> Bool = { _ in true } /// Determine whether we should keep each element, defaults to keep everything - private var memoryStore: (any RangeReplaceableCollection<T>)? = nil /// Where to put output if we want to keep it in memory as `RedditDataItem` objects - private var outputStream: WriteableStream? = nil /// Where to write output to if we just want to keep streaming it, e.g., possibly a `FileWriteStream` - private var remainingItems: UInt = 0 /// How many items are left from the index? 
- private var processedItems = 0 - - // We'll use this to pull out objects - private let decoder = JSONDecoder() - - - // Constants - //private let leftBracket = Data([0x7B]) - //private let rightBracket = Data([0x7D]) - - public init(saveTo: WriteableStream, filter: @escaping (T) -> Bool = { _ in true }) { - outputStream = saveTo - self.filter = filter - } - - public init(storeIn: any RangeReplaceableCollection<T>, filter: @escaping (T) -> Bool = { _ in true }) { - memoryStore = storeIn - self.filter = filter - } -} - - -extension BufferedRedditArchiveDecoder: WriteableStream { - public func write(_ data: UnsafePointer<UInt8>, length: Int) -> Int { - representation += Data(bytes: data, count: length) - - // While we have newlines, work through the file - while let range = representation.range(of: Data([0x0A])) { - let extractedChunk = representation.subdata(in: 0..<range.lowerBound + 1) - representation.removeSubrange(0..<range.lowerBound + 1) - - // TODO: The big archive files start with a [ String : Int32 ] that we need to parse conditionally ... need to test with those files - if processedItems == 0 { - // TODO: We probably have an extra { at the beginning and end for these files? How do we handle that gracefully? - let index = try? decoder.decode([String: Int32].self, from: extractedChunk) - - if let index = index { - remainingItems = UInt(index.count) - } - } - - let item = try? decoder.decode(T.self, from: extractedChunk) - - if let item = item, filter(item) { - if var memory = memoryStore { - memory.append(item) - } - if let output = outputStream { - let size = extractedChunk.count - let buffer = UnsafeMutablePointer<UInt8>.allocate(capacity: size) - extractedChunk.copyBytes(to: buffer, count: size) - let _ = output.write(buffer, length: size) - } - } - processedItems += 1 - } - - return length - } - - public func close() { - if let out = outputStream { - out.close() - } - debugPrint("Processed \(processedItems) entries.") - } - -} diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Comment.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Comment.swift index 3a9408f988559cb59248a7e63ff021f5f0ee489b..2a6df1d9389c3fc984ea5681280c4a91f985b9dc 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Comment.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Comment.swift @@ -45,4 +45,77 @@ public struct Comment: RedditDataItem { public let subreddit: String? public let subreddit_id: String? public let replies: String? + + enum CodingKeys: String, CodingKey { + case author = "author" + case author_created_utc = "author_created_utc" + case author_flair_css_class = "author_flair_css_class" + case author_flair_text = "author_flair_text" + case author_fullname = "author_fullname" + case body = "body" + case controversiality = "controversiality" + case created_utc = "created_utc" + case distinguished = "distinguished" + case gilded = "gilded" + case id = "id" + case link_id = "link_id" + case nest_level = "nest_level" + case parent_id = "parent_id" + case reply_delay = "reply_delay" + case retrieved_on = "retrieved_on" + case score = "score" + case score_hidden = "score_hidden" + case subreddit = "subreddit" + case subreddit_id = "subreddit_id" + case replies = "replies" + } } + + +extension Comment { + /** + Can we handle some of the messy data? 
+ */ + public init(from decoder: Decoder) throws { + + let container = try decoder.container(keyedBy: CodingKeys.self) + + author = try container.decodeIfPresent(String.self, forKey: .author) + author_created_utc = try container.decodeIfPresent(Int32.self, forKey: .author_created_utc) + author_flair_css_class = try container.decodeIfPresent(String.self, forKey: .author_flair_css_class) + author_flair_text = try container.decodeIfPresent(String.self, forKey: .author_flair_text) + author_fullname = try container.decodeIfPresent(String.self, forKey: .author_fullname) + body = try container.decodeIfPresent(String.self, forKey: .body) + controversiality = try container.decodeIfPresent(Int32.self, forKey: .controversiality) + distinguished = try container.decodeIfPresent(String.self, forKey: .distinguished) + gilded = try container.decodeIfPresent(Int32.self, forKey: .gilded) + id = try container.decodeIfPresent(String.self, forKey: .id) + link_id = try container.decodeIfPresent(String.self, forKey: .link_id) + nest_level = try container.decodeIfPresent(Int32.self, forKey: .nest_level) + parent_id = try container.decodeIfPresent(String.self, forKey: .parent_id) + reply_delay = try container.decodeIfPresent(Int32.self, forKey: .reply_delay) + retrieved_on = try container.decodeIfPresent(Int32.self, forKey: .retrieved_on) + score = try container.decodeIfPresent(Int32.self, forKey: .score) + score_hidden = try container.decodeIfPresent(Bool.self, forKey: .score_hidden) + subreddit = try container.decodeIfPresent(String.self, forKey: .subreddit) + subreddit_id = try container.decodeIfPresent(String.self, forKey: .subreddit_id) + replies = try container.decodeIfPresent(String.self, forKey: .replies) + + + // Custom decoding for 'created_utc' + //created_utc = try container.decodeIfPresent(Int32.self, forKey: .created_utc) + if let createdValue = try? container.decode(Int32.self, forKey: .created_utc) { + created_utc = createdValue + } else if let createdString = try? container.decode(String.self, forKey: .created_utc), + let createdValue = Int32(createdString) { + created_utc = createdValue + } else { + throw DecodingError.typeMismatch( + Int32.self, + DecodingError.Context(codingPath: [CodingKeys.id], debugDescription: "Expected to decode Int32 or String for created_utc") + ) + } + + } +} + diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Listing + Codable.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Listing + Codable.swift index 259163eb577ec427b9648df68bcf5b6b1cfb4044..67381ce47aff7174e851ab2a9fd35469e89c3848 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Listing + Codable.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Listing + Codable.swift @@ -31,7 +31,7 @@ extension ListingDataItem { case .more: //debugPrint("FOUND MORE") - data = try container.decode(RessidtListingMore.self, forKey: .data) + data = try container.decode(RedditListingMore.self, forKey: .data) default: throw SessionError(message: "Unknown type of Reddit content from JSON.") diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/MoreComments.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/MoreComments.swift index b8eb04b382b9d37210e84f19113ebf96f9051706..1611c7c56abf1584a37d4f60605073fb66cdd8da 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/MoreComments.swift +++ b/Sources/SwiftNLP/1. 
Data Collection/Reddit API/Data Types/MoreComments.swift @@ -21,7 +21,7 @@ // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR // OTHER DEALINGS IN THE SOFTWARE. -struct RessidtListingMore: RedditDataItem { +struct RedditListingMore: RedditDataItem { //let name: String? //let kind: String? @@ -32,6 +32,7 @@ struct RessidtListingMore: RedditDataItem { var id: String? { return nil } var created_utc: Int32? { return nil } // TODO: This is a hack that allows conformance to RedditDataItem ... fix later? + var subreddit: String? { return nil } // TODO: This is a hack that allows conformance to RedditDataItem ... fix later? } struct MoreContainer: Codable { diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/RedditDataItem.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/RedditDataItem.swift index 182162d2c9da1d8a1b13d438a5d18a02f02c51ad..e7b2f01f201ae5ebb24f6f4c5a1d568c14506a90 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/RedditDataItem.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/RedditDataItem.swift @@ -24,4 +24,6 @@ public protocol RedditDataItem: Codable, Equatable { var created_utc: Int32? { get } var id: String? { get } + var subreddit: String? { get } + // TODO: What are the common fields, and how many can we expose through this protocol? } diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Subreddit.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Subreddit.swift index f67f08402a6ee16ffff21e2fa7ac0e9d8c3e5664..7004192134618779bb56cb546f0e6873f7f86bef 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Subreddit.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Data Types/Subreddit.swift @@ -23,7 +23,7 @@ // TODO: Add conformance to RedditDataItem public struct Subreddit: RedditDataItem { - + public var id: String? public let submitText: String? public let displayName: String? @@ -44,6 +44,8 @@ public struct Subreddit: RedditDataItem { public let publicDescription: String? public var created_utc: Int32? { Int32(createdUTC) } + public var subreddit: String? { displayName } + diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Network Endpoints/Session + MoreChildren.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Network Endpoints/Session + MoreChildren.swift index b6f27c9063ffcb4b095a3fb978c248974065aab0..b52e88d19020e69b40427148769bac6785b4a5d9 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Network Endpoints/Session + MoreChildren.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Network Endpoints/Session + MoreChildren.swift @@ -77,7 +77,7 @@ extension Session { comments.append(child.data as! Comment) } if child.kind == .more { - let moreItems = child.data as! RessidtListingMore + let moreItems = child.data as! RedditListingMore more.append(contentsOf: moreItems.children) } } diff --git a/Sources/SwiftNLP/1. Data Collection/Reddit API/Session.swift b/Sources/SwiftNLP/1. Data Collection/Reddit API/Session.swift index 045e643a5b838160f20319b9e0484ecc9d8a9b58..a0244507e13032777856cc18e380f8bd84d75478 100644 --- a/Sources/SwiftNLP/1. Data Collection/Reddit API/Session.swift +++ b/Sources/SwiftNLP/1. Data Collection/Reddit API/Session.swift @@ -119,7 +119,7 @@ extension Session { comments.append(child.data as! Comment) } if child.kind == .more { - let moreItems = child.data as! RessidtListingMore + let moreItems = child.data as! 
RedditListingMore more.append(contentsOf: moreItems.children) } } diff --git a/Tests/SwiftNLPTests/1. Data Collection/CWorld.AI/Load 20k Tests.swift b/Tests/SwiftNLPTests/1. Data Collection/CWorld.AI/Load 20k Tests.swift index d33ccfb74d5fc510ec46483f9113cd0a92925461..4a37465f348cf5fa4f021be7552fdde88bc0bf9c 100644 --- a/Tests/SwiftNLPTests/1. Data Collection/CWorld.AI/Load 20k Tests.swift +++ b/Tests/SwiftNLPTests/1. Data Collection/CWorld.AI/Load 20k Tests.swift @@ -1,29 +1,46 @@ +// Copyright (c) 2024 Jim Wallace // -// File.swift -// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: // -// Created by Jason Zhao on 2023-05-18. +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. // -import XCTest -@testable import SwiftNLP +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. -final class SwiftNLPLoadDataTests: XCTestCase { - - func test20kDownload() async throws { - - let result = try await downloadSubredditFromServer(subreddit: "StopGaming") - //print("Loaded \(result.count) threads from server.") - //if let random = result.randomElement() { - // let (key, value) = random - //print("Key: \(key), Value: \(value)") - // } - XCTAssertEqual(result.count, 38624, "Failed to load subreddit data from https://reddit-top20k.cworld.ai") - - var total = 0 - for thread in result { - total += thread.value.comments.count - //print("Comments: \(thread.value.comments.count)") - } - XCTAssertEqual(total, 255294, "Failed to load comment data from https://reddit-top20k.cworld.ai") - } -} +//import XCTest +//@testable import SwiftNLP +// +//final class SwiftNLPLoadDataTests: XCTestCase { +// +// func test20kDownload() async throws { +// +// let result = try await downloadSubredditFromServer(subreddit: "StopGaming") +// //print("Loaded \(result.count) threads from server.") +// //if let random = result.randomElement() { +// // let (key, value) = random +// //print("Key: \(key), Value: \(value)") +// // } +// XCTAssertEqual(result.count, 38624, "Failed to load subreddit data from https://reddit-top20k.cworld.ai") +// +// var total = 0 +// for thread in result { +// total += thread.value.comments.count +// //print("Comments: \(thread.value.comments.count)") +// } +// XCTAssertEqual(total, 255294, "Failed to load comment data from https://reddit-top20k.cworld.ai") +// } +//} diff --git a/Tests/SwiftNLPTests/1. Data Collection/PushShift Archives/PushShift Tests.swift b/Tests/SwiftNLPTests/1. 
Data Collection/PushShift Archives/PushShift Tests.swift new file mode 100644 index 0000000000000000000000000000000000000000..096734314b9a1d2458b0e48f2ca9aa2fd39f7861 --- /dev/null +++ b/Tests/SwiftNLPTests/1. Data Collection/PushShift Archives/PushShift Tests.swift @@ -0,0 +1,129 @@ +// Copyright (c) 2024 Jim Wallace +// +// Permission is hereby granted, free of charge, to any person +// obtaining a copy of this software and associated documentation +// files (the "Software"), to deal in the Software without +// restriction, including without limitation the rights to use, +// copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the +// Software is furnished to do so, subject to the following +// conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +// OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +// HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +// WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +// FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +// OTHER DEALINGS IN THE SOFTWARE. + +import XCTest +@testable import SwiftNLP +import ZSTD + +final class PushShiftStreamingTests: XCTestCase { + + + func testNoIndex() async throws { + + guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + + let input = try FileReadStream(path: submissionsURL.path) + let stream = try StreamingRedditArchiveDecoder<Submission>(from: input) + + XCTAssert(stream.memoryStore!.count == 17999) + XCTAssert(stream.errorStore.count == 0) + } + + func testNoIndexBig() async throws { + + guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + guard let commentsURL = Bundle.module.url(forResource: "Guelph_comments", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + + let sInput = try FileReadStream(path: submissionsURL.path) + let sStream = try StreamingRedditArchiveDecoder<Submission>(from: sInput) + + let cInput = try FileReadStream(path: commentsURL.path) + let cStream = try StreamingRedditArchiveDecoder<Comment>(from: cInput) + + XCTAssert(sStream.memoryStore!.count == 17999) + XCTAssert(sStream.errorStore.count == 0) + XCTAssert(cStream.memoryStore!.count == 160104) + XCTAssert(cStream.errorStore.count == 0) + } + + func testIndex() async throws { + + guard let submissionsURL = Bundle.module.url(forResource: "RS_2006-01", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + + + let input = try FileReadStream(path: submissionsURL.path) + let config = ZSTD.DecompressConfig(parameters: [.windowLogMax(31)]) // These files require a bigger window size + let stream = try StreamingRedditArchiveDecoder<Submission>(from: input, config: config) + + XCTAssert(stream.memoryStore!.count == 8048) + XCTAssert(stream.errorStore.count == 0) + + } + + func testIndexBig() async throws { + + guard let submissionsURL = Bundle.module.url(forResource: 
"RS_2006-01", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + guard let commentsURL = Bundle.module.url(forResource: "RC_2006-01", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + + + let sInput = try FileReadStream(path: submissionsURL.path) + let config = ZSTD.DecompressConfig(parameters: [.windowLogMax(31)]) // These files require a bigger window size + let sStream = try StreamingRedditArchiveDecoder<Submission>(from: sInput, config: config) + + + let cInput = try FileReadStream(path: commentsURL.path) + let cStream = try StreamingRedditArchiveDecoder<Comment>(from: cInput, config: config) + + XCTAssert(sStream.memoryStore!.count == 8048) + XCTAssert(sStream.errorStore.count == 0) + XCTAssert(cStream.memoryStore!.count == 3666) + XCTAssert(cStream.errorStore.count == 0) + } + + func testFilter() async throws { + + guard let submissionsURL = Bundle.module.url(forResource: "Guelph_submissions", withExtension: "zst") else { + fatalError("Failed to find waterloo_submissions.zst in test bundle.") + } + + + let input = try FileReadStream(path: submissionsURL.path) + + let filter: (Submission) -> Bool = { value in + if let selftext = value.selftext { + if selftext.contains("university") { + return true + } + } + return false + } + + let stream = try StreamingRedditArchiveDecoder(from: input, filter: filter) + + XCTAssert(stream.memoryStore!.count == 139) + XCTAssert(stream.errorStore.count == 0) + } + +} diff --git a/Tests/SwiftNLPTests/Resources/RC_2006-01.zst b/Tests/SwiftNLPTests/Resources/RC_2006-01.zst new file mode 100644 index 0000000000000000000000000000000000000000..fd0a54464b688cb7b09454284695ee749bc9ac28 Binary files /dev/null and b/Tests/SwiftNLPTests/Resources/RC_2006-01.zst differ diff --git a/Tests/SwiftNLPTests/Resources/RS_2006-01.zst b/Tests/SwiftNLPTests/Resources/RS_2006-01.zst new file mode 100644 index 0000000000000000000000000000000000000000..c87fdea601108b10b3a642e0055679de77572629 Binary files /dev/null and b/Tests/SwiftNLPTests/Resources/RS_2006-01.zst differ diff --git a/Tests/SwiftNLPTests/TestUtil/LoadTestJson.swift b/Tests/SwiftNLPTests/TestUtil/LoadTestJson.swift deleted file mode 100644 index 770d52de2b72400a0fde878c1ef3454781109fef..0000000000000000000000000000000000000000 --- a/Tests/SwiftNLPTests/TestUtil/LoadTestJson.swift +++ /dev/null @@ -1,95 +0,0 @@ -// -// LoadTestJson.swift -// -// -// Created by Jason Zhao on 2023-05-18. -// - -import Foundation -import SwiftNLP - -// the goal of this util is to fetch and load json files into the Data class -// which can then be used directly by the package to convert into objects -// right now it takes test json from the Resources folder in Test -// however, eventually we would like to fetch the data from a cloud server -// With modularity, we can simply change the logic of this Util to grab from cloud, and return the same data structure -class TestUtils { - // loads a json data given file name - static func loadJsonData(file: String) -> Data { - let bundle = Bundle.module - guard let url = bundle.url(forResource: file, withExtension: "json") else { - fatalError("Failed to find \(file).json in test bundle.") - } - guard let data = try? 
Data(contentsOf: url) else { - fatalError("Failed to load \(file).json from test bundle.") - } - return data - } - - // returns all json files with the specific prefix - // right now we distinguish with "RC" and "RS" - static func getJsonFiles(prefix: String) -> [String] { - let fileManager = FileManager.default - guard let resourcePath = Bundle.module.resourcePath else { - print("Failed to find resource path") - return [] - } - - do { - let files = try fileManager.contentsOfDirectory(atPath: resourcePath) - let jsonFiles = files.filter{$0.hasPrefix(prefix) && $0.hasSuffix(".json")} - - var fileNames: [String] = [] - - for jsonFile in jsonFiles { - let fileName = (jsonFile as NSString).deletingPathExtension - fileNames.append(fileName) - } - return fileNames - } catch let error as NSError { - print("Error while getting files : \(error)") - return [] - } - } - - // load all the json files with the prefix - static func loadAllFiles(prefix: String) -> [Data] { - let allJsonFiles = TestUtils.getJsonFiles(prefix: prefix) - var result: [Data] = [] - for fileName in allJsonFiles { - let json = TestUtils.loadJsonData(file: fileName) - result.append(json) - } - return result - } - - // loads all reddit comments files into json Data - static func loadAllRedditComment() -> [Data] { - return loadAllFiles(prefix: "RC") - } - - // loads all reddit submissions files into json Data - static func loadAllRedditSubmission() -> [Data] { - return loadAllFiles(prefix: "RS") - } - - static func readRedditCommentJson(json: Data) -> RedditCommentData? { - do { - let commentData = try JSONDecoder().decode(RedditCommentData.self, from: json) - return commentData - } catch { - print("Error while decoding reddit comment file: \(error)") - return nil - } - } - - static func readRedditSubmissionJson(json: Data) -> RedditSubmissionData? { - do { - let submissionData = try JSONDecoder().decode(RedditSubmissionData.self, from: json) - return submissionData - } catch { - print("Error while decoding reddit submission file: \(error)") - return nil - } - } -}
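For reference, a minimal usage sketch of the StreamingRedditArchiveDecoder added in this diff, mirroring the testFilter test above. The archive path is hypothetical; only FileReadStream, Submission (with its optional selftext), and the decoder's initializer from this diff are assumed.

import Foundation
import SwiftNLP
import ZSTD

// Open a zstd-compressed, newline-delimited-JSON PushShift dump (hypothetical path).
let input = try FileReadStream(path: "/data/Guelph_submissions.zst")

// Keep only submissions whose selftext mentions "university", as in testFilter.
let filter: (Submission) -> Bool = { submission in
    submission.selftext?.contains("university") ?? false
}

// Decompression and decoding happen inside the initializer: each newline-terminated
// JSON line is decoded as a Submission, filtered items land in memoryStore, and
// chunks that fail to decode are kept in errorStore.
let stream = try StreamingRedditArchiveDecoder<Submission>(from: input, filter: filter)

print("Decoded \(stream.memoryStore?.count ?? 0) submissions, \(stream.errorStore.count) decode failures")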
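The memory-mapped variant follows the same pattern. A sketch under the assumption that the monthly RS_/RC_ dumps need the larger zstd window configured in testIndex; again, the path is hypothetical.

import Foundation
import SwiftNLP
import ZSTD

// MMapFileReadStream maps the archive with Data(..., options: .alwaysMapped)
// rather than buffered reads, so pages are faulted in on demand.
let mapped = try MMapFileReadStream(path: "/data/RS_2006-01.zst")

// The monthly PushShift dumps require a larger decompression window.
let config = ZSTD.DecompressConfig(parameters: [.windowLogMax(31)])

let decoder = try MMapStreamingRedditArchiveDecoder<Submission>(from: mapped, config: config)
print("Decoded \(decoder.memoryStore?.count ?? 0) submissions")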
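To illustrate the lenient Comment decoder added above, which accepts created_utc either as a number or as a quoted string, a small hedged example with hypothetical data; all fields other than created_utc are decoded with decodeIfPresent, so a minimal JSON line is enough.

import Foundation
import SwiftNLP

// Hypothetical archive line: created_utc arrives as a quoted string here,
// which the custom init(from:) converts to Int32.
let line = #"{"id":"c100","created_utc":"1136073600","body":"hello"}"#.data(using: .utf8)!

let comment = try JSONDecoder().decode(Comment.self, from: line)
assert(comment.created_utc == 1136073600)
assert(comment.body == "hello")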