From 1e56c0b4d321e9d0183674be9b8c4999e6238f4e Mon Sep 17 00:00:00 2001 From: Anees Iqbal Date: Wed, 20 May 2026 17:35:30 +0200 Subject: [PATCH 1/3] :new: Make video stream optional in RTSPClientSession MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A session is now considered valid when any of video, audio, or analytics metadata is set up, so audio-only and metadata-only RTSP configurations (e.g. Axis cameras with `video=0`) work end-to-end. Breaking: SessionDescription replaces the flat video and audio fields with `video: VideoStream?` and `audio: AudioStream?` substructs. One Optional check unlocks all the codec-and-parameter fields together — removes the prior anti-pattern of multiple parallel Optionals sharing the same "all-set-or-all-nil" invariant. ```swift struct SessionDescription { let video: VideoStream? // codec, clockRate, sps, pps, vps, resolution let audio: AudioStream? // codec, sampleRate, channels, extraData let metadataEncoding: String? } ``` Internally, the startup path mirrors the audio/metadata best-effort pattern: video find is optional, video SETUP failure is still fatal (matching audio), and two gates flank PLAY: - A pre-PLAY gate throws if no supported video/audio/metadata stream was set up. The error message enumerates what was offered to make misconfigurations diagnosable. - A post-init gate catches the case where SETUP succeeded but every depacketizer/timeline init failed (e.g. malformed AAC fmtp, broken metadata clock rate). Required so the documented "at least one usable stream" invariant holds at the return site too. The audio depacketizer init path now also nullifies its stream state on failure, for symmetry with the metadata path — keeps the post-init gate honest. Encoding-support predicates (`isVideoEncodingSupported`, `isAudioEncodingSupported`, `isApplicationEncodingSupported`) are extracted to module-level free functions so tests can exercise them directly. 
--- Examples/CameraViewer/main.swift | 33 ++- Sources/IPCamKit/Client/RTSPSession.swift | 337 ++++++++++++++-------- 2 files changed, 235 insertions(+), 135 deletions(-) diff --git a/Examples/CameraViewer/main.swift b/Examples/CameraViewer/main.swift index cff1c25..7db579c 100644 --- a/Examples/CameraViewer/main.swift +++ b/Examples/CameraViewer/main.swift @@ -282,23 +282,25 @@ final class CameraViewerDelegate: NSObject, NSApplicationDelegate { do { let desc = try await session.start() - let res = desc.resolution.map { "\($0.width)×\($0.height)" } ?? "?" - log("Connected: \(desc.videoCodec) \(res)") + let res = desc.video?.resolution.map { "\($0.width)×\($0.height)" } ?? "?" + let codecLabel = desc.video.map { "\($0.codec)" } ?? "no video" + log("Connected: \(codecLabel) \(res)") await MainActor.run { - window.title = "IPCamKit — \(desc.videoCodec) \(res)" + window.title = "IPCamKit — \(codecLabel) \(res)" } - if let audioCodec = desc.audioCodec, let audioRate = desc.audioSampleRate { + if let audio = desc.audio { audioPlayer.start( - codec: audioCodec, sampleRate: Double(audioRate), - channels: UInt32(desc.audioChannels ?? 1)) + codec: audio.codec, sampleRate: Double(audio.sampleRate), + channels: UInt32(audio.channels ?? 1)) } - var fmtDesc = try makeFormatDescription( - codec: desc.videoCodec, - sps: desc.sps, pps: desc.pps, vps: desc.vps - ) + var fmtDesc: CMVideoFormatDescription? 
+ if let video = desc.video, let sps = video.sps, let pps = video.pps { + fmtDesc = try makeFormatDescription( + codec: video.codec, sps: sps, pps: pps, vps: video.vps) + } let layer = layerRef.layer var receivedKeyframe = false @@ -311,9 +313,10 @@ final class CameraViewerDelegate: NSObject, NSApplicationDelegate { audioPlayer.enqueue(audioFrame) case .video(let frame): + guard let video = desc.video else { continue } if let newSPS = frame.sps, let newPPS = frame.pps { fmtDesc = try makeFormatDescription( - codec: desc.videoCodec, + codec: video.codec, sps: newSPS, pps: newPPS, vps: frame.vps ) } @@ -323,9 +326,11 @@ final class CameraViewerDelegate: NSObject, NSApplicationDelegate { receivedKeyframe = true } - if let sample = buildSampleBuffer( - frame, codec: desc.videoCodec, formatDescription: fmtDesc - ) { + if let fmt = fmtDesc, + let sample = buildSampleBuffer( + frame, codec: video.codec, formatDescription: fmt + ) + { if layer.status == .failed { layer.flush() } layer.enqueue(sample) } diff --git a/Sources/IPCamKit/Client/RTSPSession.swift b/Sources/IPCamKit/Client/RTSPSession.swift index eb9c251..cdccdf0 100644 --- a/Sources/IPCamKit/Client/RTSPSession.swift +++ b/Sources/IPCamKit/Client/RTSPSession.swift @@ -50,24 +50,41 @@ public struct RTSPDiagnostic: Sendable { } } -/// Parsed session description returned from `start()`. -public struct SessionDescription: Sendable { - public let videoCodec: VideoCodec - public let sps: Data - public let pps: Data - /// VPS data (H.265 only, nil for H.264). +/// Video stream details, surfaced when a supported video stream is active. +/// +/// `codec` and `clockRate` are always populated. `sps`/`pps`/`vps`/`resolution` +/// are `nil` until the depacketizer has observed parameter sets — for most +/// cameras this happens in-band on the first packet; cameras that ship the +/// parameter sets in SDP `fmtp` will have them set at `start()` time. 
+public struct VideoStream: Sendable { + public let codec: VideoCodec + public let clockRate: UInt32 + public let sps: Data? + public let pps: Data? + /// H.265 only; nil for H.264. public let vps: Data? public let resolution: (width: Int, height: Int)? - public let clockRate: UInt32 +} - /// Audio codec, if an audio stream was found. - public let audioCodec: PublicAudioCodec? - /// Audio sample rate in Hz, if an audio stream was found. - public let audioSampleRate: UInt32? - /// Audio channel count, if known. - public let audioChannels: UInt16? +/// Audio stream details, surfaced when a supported audio stream is active. +/// +/// `codec` and `sampleRate` are always populated. `channels` and `extraData` +/// reflect what the camera advertised; they are codec-dependent. +public struct AudioStream: Sendable { + public let codec: PublicAudioCodec + public let sampleRate: UInt32 + public let channels: UInt16? /// Codec-specific extra data (e.g. AudioSpecificConfig for AAC). - public let audioExtraData: Data? + public let extraData: Data? +} + +/// Parsed session description returned from `start()`. +/// +/// At least one of `video`, `audio`, or `metadataEncoding` is non-`nil` — +/// a session with zero usable streams is rejected at `start()`. +public struct SessionDescription: Sendable { + public let video: VideoStream? + public let audio: AudioStream? /// SDP encoding name of the analytics-metadata stream if one was set up /// (e.g. `vnd.onvif.metadata`), or `nil` if no metadata stream is active. 
@@ -316,41 +333,42 @@ actor SessionState { var presMut = try parseDescribe(requestURL: url, response: describeResp) presentation = presMut - // Find first H.264 or H.265 video stream - guard - let videoIdx = presMut.streams.firstIndex(where: { - $0.media == "video" && ($0.encodingName == "h264" || $0.encodingName == "h265") - }) - else { - throw RTSPError.sessionSetupFailed( - statusCode: 0, reason: "No H.264/H.265 video stream found") - } - - let stream = presMut.streams[videoIdx] self.url = url - self.videoStreamIndex = videoIdx - - // SETUP - let setupURL = stream.control ?? url - var setupHeaders: [(String, String)] = [] - if transport == .tcp { - let channelId = channelMappings.nextUnassigned() ?? 0 - setupHeaders.append( - ( - "Transport", - "RTP/AVP/TCP;unicast;interleaved=\(channelId)-\(channelId + 1)" - )) - try channelMappings.assign(channelId: channelId, streamIndex: videoIdx) - } else { - setupHeaders.append(("Transport", "RTP/AVP;unicast")) - } - let setupResp = try await sendRequest( - method: .setup, url: setupURL, extraHeaders: setupHeaders) - let setup = try parseSetup(response: setupResp) - sessionId = setup.session.id - presMut.streams[videoIdx].state = .setup( - StreamStateInit(ssrc: setup.ssrc, initialSeq: nil, initialRtptime: nil, ctx: .dummy)) + // Find first H.264 or H.265 video stream — optional. Cameras can be + // configured to expose audio-only or metadata-only RTSP sessions + // (e.g. Axis with `video=0`), in which case we proceed without video + // and require at least one of audio/metadata to be set up. + let videoIdx = presMut.streams.firstIndex(where: { + $0.media == "video" && isVideoEncodingSupported($0.encodingName) + }) + var videoSetupSSRC: UInt32? + + if let videoIdx = videoIdx { + let stream = presMut.streams[videoIdx] + let setupURL = stream.control ?? url + var setupHeaders: [(String, String)] = [] + if transport == .tcp { + let channelId = channelMappings.nextUnassigned() ?? 
0 + setupHeaders.append( + ( + "Transport", + "RTP/AVP/TCP;unicast;interleaved=\(channelId)-\(channelId + 1)" + )) + try channelMappings.assign(channelId: channelId, streamIndex: videoIdx) + } else { + setupHeaders.append(("Transport", "RTP/AVP;unicast")) + } + + let setupResp = try await sendRequest( + method: .setup, url: setupURL, extraHeaders: setupHeaders) + let setup = try parseSetup(response: setupResp) + sessionId = setup.session.id + videoSetupSSRC = setup.ssrc + presMut.streams[videoIdx].state = .setup( + StreamStateInit(ssrc: setup.ssrc, initialSeq: nil, initialRtptime: nil, ctx: .dummy)) + self.videoStreamIndex = videoIdx + } // Find and SETUP audio stream (optional, best-effort) let audioIdx = presMut.streams.firstIndex(where: { s in @@ -462,6 +480,20 @@ actor SessionState { } } + // Require at least one usable stream — a session with no video, audio, + // or metadata is degenerate (DESCRIBE succeeded but nothing is carrying + // payload), and sending PLAY would just open the door to packets we + // can't route. + if videoStreamIndex == nil && audioStreamIndex == nil && applicationStreamIndex == nil { + let offered = presMut.streams.map { "\($0.media)/\($0.encodingName)" } + .joined(separator: ", ") + throw RTSPError.sessionSetupFailed( + statusCode: 0, + reason: + "No supported video, audio, or metadata stream was set up " + + "(offered: \(offered.isEmpty ? 
"" : offered)).") + } + // PLAY var playHeaders: [(String, String)] = [] if let sid = sessionId { @@ -475,36 +507,42 @@ actor SessionState { try parsePlay(response: playResp, presentation: &presMut) presentation = presMut - // Initialize video depacketizer - if stream.encodingName == "h265" { - depacketizer = .h265( - try H265Depacketizer( - clockRate: stream.clockRateHz, - formatSpecificParams: stream.formatSpecificParams)) - } else { - depacketizer = .h264( - try H264Depacketizer( - clockRate: stream.clockRateHz, - formatSpecificParams: stream.formatSpecificParams)) - } + // Initialize video depacketizer + timeline + inorder parser. Conditional + // on a successful video SETUP — when no video stream was set up, + // `videoStreamIndex` is nil and we skip the entire video pipeline. + var videoClockRate: UInt32? + if let videoIdx = videoStreamIndex { + let stream = presMut.streams[videoIdx] + if stream.encodingName == "h265" { + depacketizer = .h265( + try H265Depacketizer( + clockRate: stream.clockRateHz, + formatSpecificParams: stream.formatSpecificParams)) + } else { + depacketizer = .h264( + try H264Depacketizer( + clockRate: stream.clockRateHz, + formatSpecificParams: stream.formatSpecificParams)) + } + videoClockRate = stream.clockRateHz - // Initialize video timeline and inorder parser - var videoStart: UInt32? - var videoSeq: UInt16? - var videoSsrc: UInt32? = setup.ssrc + var videoStart: UInt32? + var videoSeq: UInt16? + var videoSsrc: UInt32? 
= videoSetupSSRC - if case .setup(let init_) = presMut.streams[videoIdx].state { - videoStart = init_.initialRtptime - if let seq = init_.initialSeq, seq != 0, seq != 1 { - videoSeq = seq + if case .setup(let init_) = presMut.streams[videoIdx].state { + videoStart = init_.initialRtptime + if let seq = init_.initialSeq, seq != 0, seq != 1 { + videoSeq = seq + } + if let s = init_.ssrc { videoSsrc = s } } - if let s = init_.ssrc { videoSsrc = s } - } - let timeline = try Timeline(start: videoStart, clockRate: stream.clockRateHz) - inorderParsers[videoIdx] = InorderParser( - ssrc: videoSsrc, nextSeq: videoSeq, isTcp: transport == .tcp, - timeline: timeline, onDiagnostic: onDiagnostic) + let timeline = try Timeline(start: videoStart, clockRate: stream.clockRateHz) + inorderParsers[videoIdx] = InorderParser( + ssrc: videoSsrc, nextSeq: videoSeq, isTcp: transport == .tcp, + timeline: timeline, onDiagnostic: onDiagnostic) + } // Initialize audio depacketizer and inorder parser var resolvedAudioCodec: PublicAudioCodec? @@ -543,6 +581,22 @@ actor SessionState { from: audioStream.encodingName) resolvedAudioRate = audioStream.clockRateHz resolvedAudioChannels = audioStream.channels + } else { + // Audio SETUP succeeded but the depacketizer rejected the format + // (e.g. malformed AAC fmtp). Null the audio state so packets on + // that interleaved channel are silently dropped instead of + // misrouted, and so `SessionDescription.audio` is `nil` (matching + // reality). Mirrors the metadata-init failure path below. + onDiagnostic?( + RTSPDiagnostic( + severity: .warning, + message: + "Audio depacketizer init failed for " + + "\(audioStream.encodingName); audio will not be delivered.")) + audioStreamIndex = nil + audioEncodingName = nil + audioClockRate = nil + audioChannels = nil } } @@ -584,46 +638,70 @@ actor SessionState { } } + // Post-init gate. 
The pre-PLAY gate above caught the case where no + // stream was supported, but audio and metadata depacketizer init run + // *after* PLAY and can null their own indices on failure (malformed + // AAC fmtp, broken Timeline clock rate, etc.). Re-check here so the + // documented "at least one usable stream" invariant holds at the + // return site too. PLAY has already been sent; the caller will tear + // down via `stop()` after we throw. + if videoStreamIndex == nil && audioStreamIndex == nil && applicationStreamIndex == nil { + throw RTSPError.sessionSetupFailed( + statusCode: 0, + reason: + "All stream depacketizers failed to initialize after PLAY; " + + "session has no usable streams.") + } + isPlaying = true - // Build session description - let isH265 = stream.encodingName == "h265" - let sps: Data - let pps: Data - var vps: Data? - let dims: (width: UInt16, height: UInt16)? - if let depkt = depacketizer { + // Build session description. `video` is nil when no video stream was + // set up; consumers branch on `desc.video != nil` to discover availability. + // `depacketizer` and `videoClockRate` are set in lock-step inside the + // video-init block above, so the outer `if let` enforces that invariant. + let video: VideoStream? + if let depkt = depacketizer, let clockRate = videoClockRate { switch depkt { case .h264(let d): - sps = d.parameters?.spsNAL ?? Data() - pps = d.parameters?.ppsNAL ?? Data() - dims = d.parameters?.genericParameters.pixelDimensions + let dims = d.parameters?.genericParameters.pixelDimensions + video = VideoStream( + codec: .h264, + clockRate: clockRate, + sps: d.parameters?.spsNAL, + pps: d.parameters?.ppsNAL, + vps: nil, + resolution: dims.map { (width: Int($0.width), height: Int($0.height)) } + ) case .h265(let d): - sps = d.parameters?.spsNAL ?? Data() - pps = d.parameters?.ppsNAL ?? 
Data() - vps = d.parameters?.vpsNAL - dims = d.parameters?.genericParameters.pixelDimensions + let dims = d.parameters?.genericParameters.pixelDimensions + video = VideoStream( + codec: .h265, + clockRate: clockRate, + sps: d.parameters?.spsNAL, + pps: d.parameters?.ppsNAL, + vps: d.parameters?.vpsNAL, + resolution: dims.map { (width: Int($0.width), height: Int($0.height)) } + ) } } else { - sps = Data() - pps = Data() - dims = nil + video = nil } - let resolution = dims.map { - (width: Int($0.width), height: Int($0.height)) + + let audio: AudioStream? + if let codec = resolvedAudioCodec, let rate = resolvedAudioRate { + audio = AudioStream( + codec: codec, + sampleRate: rate, + channels: resolvedAudioChannels, + extraData: audioDepacketizer?.audioParameters?.extraData + ) + } else { + audio = nil } return SessionDescription( - videoCodec: isH265 ? .h265 : .h264, - sps: sps, - pps: pps, - vps: vps, - resolution: resolution, - clockRate: stream.clockRateHz, - audioCodec: resolvedAudioCodec, - audioSampleRate: resolvedAudioRate, - audioChannels: resolvedAudioChannels, - audioExtraData: audioDepacketizer?.audioParameters?.extraData, + video: video, + audio: audio, metadataEncoding: applicationEncodingName ) } @@ -878,25 +956,6 @@ actor SessionState { ) } - private func isAudioEncodingSupported(_ name: String) -> Bool { - switch name { - case "mpeg4-generic", "pcmu", "pcma", "l16", "g722", "g723", - "u8", "dvi4", "g726-16", "g726-24", "g726-32", "g726-40": - return true - default: - return false - } - } - - private func isApplicationEncodingSupported(_ name: String) -> Bool { - switch name { - case "vnd.onvif.metadata": - return true - default: - return false - } - } - private func publicAudioCodec(from encoding: String) -> PublicAudioCodec { switch encoding { case "mpeg4-generic": return .aac @@ -909,3 +968,39 @@ actor SessionState { } } } + +// MARK: - Encoding-support predicates (free functions, testable) + +/// True iff `RTSPClientSession` can depacketize a video 
stream advertising +/// this SDP `a=rtpmap` encoding name. +func isVideoEncodingSupported(_ name: String) -> Bool { + switch name { + case "h264", "h265": + return true + default: + return false + } +} + +/// True iff `RTSPClientSession` can depacketize an audio stream advertising +/// this SDP `a=rtpmap` encoding name. +func isAudioEncodingSupported(_ name: String) -> Bool { + switch name { + case "mpeg4-generic", "pcmu", "pcma", "l16", "g722", "g723", + "u8", "dvi4", "g726-16", "g726-24", "g726-32", "g726-40": + return true + default: + return false + } +} + +/// True iff `RTSPClientSession` can depacketize an analytics-metadata stream +/// advertising this SDP `a=rtpmap` encoding name. +func isApplicationEncodingSupported(_ name: String) -> Bool { + switch name { + case "vnd.onvif.metadata": + return true + default: + return false + } +} From 81265a25d5aa61965ff73ff7325ca34a82cca015 Mon Sep 17 00:00:00 2001 From: Anees Iqbal Date: Wed, 20 May 2026 17:35:45 +0200 Subject: [PATCH 2/3] :white_check_mark: Add SDP fixtures and parser tests for video-less cases Four new SDP fixtures: three Axis-style configurations (audio-only, metadata-only, audio + metadata) modelled on `video=0` query strings, and one all-unsupported fixture (JPEG video, Opus audio, vendor-specific metadata) used to exercise the encoding-support filters. Parser tests cover each fixture's stream layout, including the negative invariants (no video, etc.) the optional-video refactor enables. Additional tests cover the encoding-support predicates directly and the "would the gate fire?" filter the predicates feed: the all-unsupported fixture produces nil for all three slots, while the Axis fixtures produce exactly the slots their SDPs advertise. Fixtures now include `a=recvonly` per media section, matching the ONVIF Streaming Specification example and the existing camera SDP fixtures in this directory. 
--- Tests/IPCamKitTests/DescribeParserTests.swift | 126 ++++++++++++++++++ .../TestData/axis_audio_metadata_sdp.txt | 20 +++ .../TestData/axis_audio_only_sdp.txt | 15 +++ .../TestData/axis_metadata_only_sdp.txt | 14 ++ .../TestData/unsupported_encodings_sdp.txt | 24 ++++ 5 files changed, 199 insertions(+) create mode 100644 Tests/IPCamKitTests/TestData/axis_audio_metadata_sdp.txt create mode 100644 Tests/IPCamKitTests/TestData/axis_audio_only_sdp.txt create mode 100644 Tests/IPCamKitTests/TestData/axis_metadata_only_sdp.txt create mode 100644 Tests/IPCamKitTests/TestData/unsupported_encodings_sdp.txt diff --git a/Tests/IPCamKitTests/DescribeParserTests.swift b/Tests/IPCamKitTests/DescribeParserTests.swift index 303fbe6..8793d85 100644 --- a/Tests/IPCamKitTests/DescribeParserTests.swift +++ b/Tests/IPCamKitTests/DescribeParserTests.swift @@ -686,4 +686,130 @@ struct DescribeParserTests { #expect(setup.source == nil) #expect(setup.serverPort == 49152) } + + // MARK: - Video-less stream configurations (Axis `video=0`) + + @Test("Axis audio-only SDP (no video stream)") + func axisAudioOnlySDP() throws { + let p = try loadDescribe(url: "rtsp://127.0.0.1/", filename: "axis_audio_only_sdp.txt") + #expect(p.streams.count == 1) + let s0 = p.streams[0] + #expect(s0.media == "audio") + #expect(s0.encodingName == "mpeg4-generic") + #expect(s0.clockRateHz == 16000) + #expect(s0.channels == 1) + #expect(p.streams.contains(where: { $0.media == "video" }) == false) + } + + @Test("Axis metadata-only SDP (no video or audio)") + func axisMetadataOnlySDP() throws { + let p = try loadDescribe(url: "rtsp://127.0.0.1/", filename: "axis_metadata_only_sdp.txt") + #expect(p.streams.count == 1) + let s0 = p.streams[0] + #expect(s0.media == "application") + #expect(s0.encodingName == "vnd.onvif.metadata") + #expect(s0.clockRateHz == 90000) + #expect(p.streams.contains(where: { $0.media == "video" }) == false) + #expect(p.streams.contains(where: { $0.media == "audio" }) == false) + } + + 
@Test("Axis audio + metadata SDP (no video)") + func axisAudioMetadataSDP() throws { + let p = try loadDescribe(url: "rtsp://127.0.0.1/", filename: "axis_audio_metadata_sdp.txt") + #expect(p.streams.count == 2) + #expect(p.streams[0].media == "audio") + #expect(p.streams[0].encodingName == "mpeg4-generic") + #expect(p.streams[1].media == "application") + #expect(p.streams[1].encodingName == "vnd.onvif.metadata") + #expect(p.streams.contains(where: { $0.media == "video" }) == false) + } + + // MARK: - Encoding-support predicates + + @Test("Video encoding support predicate") + func videoEncodingSupport() { + #expect(isVideoEncodingSupported("h264")) + #expect(isVideoEncodingSupported("h265")) + #expect(!isVideoEncodingSupported("jpeg")) + #expect(!isVideoEncodingSupported("vp8")) + #expect(!isVideoEncodingSupported("")) + } + + @Test("Audio encoding support predicate") + func audioEncodingSupport() { + #expect(isAudioEncodingSupported("mpeg4-generic")) + #expect(isAudioEncodingSupported("pcma")) + #expect(isAudioEncodingSupported("pcmu")) + #expect(isAudioEncodingSupported("l16")) + #expect(isAudioEncodingSupported("g722")) + #expect(isAudioEncodingSupported("g723")) + #expect(isAudioEncodingSupported("g726-32")) + #expect(!isAudioEncodingSupported("opus")) + #expect(!isAudioEncodingSupported("speex")) + #expect(!isAudioEncodingSupported("")) + } + + @Test("Application encoding support predicate") + func applicationEncodingSupport() { + #expect(isApplicationEncodingSupported("vnd.onvif.metadata")) + #expect(!isApplicationEncodingSupported("vnd.axis.metadata")) + #expect(!isApplicationEncodingSupported("vnd.hikvision.metadata")) + #expect(!isApplicationEncodingSupported("")) + } + + /// Mirrors the `firstIndex(where:)` filters at the top of `start()`. + /// When all three return `nil`, the pre-PLAY "at least one usable stream" + /// gate throws `sessionSetupFailed`. 
+ private func discoverUsableStreams( + _ p: Presentation + ) -> (video: Int?, audio: Int?, metadata: Int?) { + let v = p.streams.firstIndex { + $0.media == "video" && isVideoEncodingSupported($0.encodingName) + } + let a = p.streams.firstIndex { + $0.media == "audio" && isAudioEncodingSupported($0.encodingName) + } + let m = p.streams.firstIndex { + $0.media == "application" && isApplicationEncodingSupported($0.encodingName) + } + return (v, a, m) + } + + @Test("SDP with only unsupported encodings leaves all stream slots empty") + func unsupportedEncodingsSDP() throws { + let p = try loadDescribe( + url: "rtsp://127.0.0.1/", filename: "unsupported_encodings_sdp.txt") + // Parser tolerates the SDP (3 streams advertised, all unsupported). + #expect(p.streams.count == 3) + #expect(p.streams[0].encodingName == "jpeg") + #expect(p.streams[1].encodingName == "opus") + #expect(p.streams[2].encodingName == "vnd.axis.metadata") + + // None of the three would survive the encoding-support filters in + // `start()` — so the pre-PLAY gate would throw `sessionSetupFailed`. 
+ let usable = discoverUsableStreams(p) + #expect(usable.video == nil) + #expect(usable.audio == nil) + #expect(usable.metadata == nil) + } + + @Test("Axis audio-only SDP yields only the audio slot") + func axisAudioOnlyUsableStreams() throws { + let p = try loadDescribe( + url: "rtsp://127.0.0.1/", filename: "axis_audio_only_sdp.txt") + let usable = discoverUsableStreams(p) + #expect(usable.video == nil) + #expect(usable.audio != nil) + #expect(usable.metadata == nil) + } + + @Test("Axis metadata-only SDP yields only the metadata slot") + func axisMetadataOnlyUsableStreams() throws { + let p = try loadDescribe( + url: "rtsp://127.0.0.1/", filename: "axis_metadata_only_sdp.txt") + let usable = discoverUsableStreams(p) + #expect(usable.video == nil) + #expect(usable.audio == nil) + #expect(usable.metadata != nil) + } } diff --git a/Tests/IPCamKitTests/TestData/axis_audio_metadata_sdp.txt b/Tests/IPCamKitTests/TestData/axis_audio_metadata_sdp.txt new file mode 100644 index 0000000..913a69a --- /dev/null +++ b/Tests/IPCamKitTests/TestData/axis_audio_metadata_sdp.txt @@ -0,0 +1,20 @@ +v=0 +o=- 1620251477190769 1620251477190769 IN IP4 192.168.1.50 +s=Media Presentation +e=NONE +c=IN IP4 0.0.0.0 +b=AS:82 +t=0 0 +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/?video=0&event=on +a=range:npt=0- +m=audio 0 RTP/AVP 97 +b=AS:32 +a=recvonly +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/trackID=2 +a=rtpmap:97 mpeg4-generic/16000/1 +a=fmtp:97 streamtype=5; profile-level-id=2; mode=AAC-hbr; config=1408; sizeLength=13; indexLength=3; indexDeltaLength=3; profile=1; bitrate=32000 +m=application 0 RTP/AVP 107 +b=AS:50 +a=recvonly +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/trackID=3 +a=rtpmap:107 vnd.onvif.metadata/90000 diff --git a/Tests/IPCamKitTests/TestData/axis_audio_only_sdp.txt b/Tests/IPCamKitTests/TestData/axis_audio_only_sdp.txt new file mode 100644 index 0000000..d60eb38 --- /dev/null +++ b/Tests/IPCamKitTests/TestData/axis_audio_only_sdp.txt 
@@ -0,0 +1,15 @@ +v=0 +o=- 1620251477190769 1620251477190769 IN IP4 192.168.1.50 +s=Media Presentation +e=NONE +c=IN IP4 0.0.0.0 +b=AS:32 +t=0 0 +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/?video=0 +a=range:npt=0- +m=audio 0 RTP/AVP 97 +b=AS:32 +a=recvonly +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/trackID=2 +a=rtpmap:97 mpeg4-generic/16000/1 +a=fmtp:97 streamtype=5; profile-level-id=2; mode=AAC-hbr; config=1408; sizeLength=13; indexLength=3; indexDeltaLength=3; profile=1; bitrate=32000 diff --git a/Tests/IPCamKitTests/TestData/axis_metadata_only_sdp.txt b/Tests/IPCamKitTests/TestData/axis_metadata_only_sdp.txt new file mode 100644 index 0000000..e0232e4 --- /dev/null +++ b/Tests/IPCamKitTests/TestData/axis_metadata_only_sdp.txt @@ -0,0 +1,14 @@ +v=0 +o=- 1620251477190769 1620251477190769 IN IP4 192.168.1.50 +s=Media Presentation +e=NONE +c=IN IP4 0.0.0.0 +b=AS:50 +t=0 0 +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/?video=0&audio=0&event=on +a=range:npt=0- +m=application 0 RTP/AVP 107 +b=AS:50 +a=recvonly +a=control:rtsp://192.168.1.50:554/axis-media/media.amp/trackID=3 +a=rtpmap:107 vnd.onvif.metadata/90000 diff --git a/Tests/IPCamKitTests/TestData/unsupported_encodings_sdp.txt b/Tests/IPCamKitTests/TestData/unsupported_encodings_sdp.txt new file mode 100644 index 0000000..a285907 --- /dev/null +++ b/Tests/IPCamKitTests/TestData/unsupported_encodings_sdp.txt @@ -0,0 +1,24 @@ +v=0 +o=- 1620251477190769 1620251477190769 IN IP4 192.168.1.50 +s=Media Presentation +e=NONE +c=IN IP4 0.0.0.0 +b=AS:200 +t=0 0 +a=control:rtsp://192.168.1.50:554/unsupported/ +a=range:npt=0- +m=video 0 RTP/AVP 26 +b=AS:150 +a=recvonly +a=control:rtsp://192.168.1.50:554/unsupported/trackID=1 +a=rtpmap:26 JPEG/90000 +m=audio 0 RTP/AVP 96 +b=AS:48 +a=recvonly +a=control:rtsp://192.168.1.50:554/unsupported/trackID=2 +a=rtpmap:96 opus/48000/2 +m=application 0 RTP/AVP 107 +b=AS:50 +a=recvonly +a=control:rtsp://192.168.1.50:554/unsupported/trackID=3 +a=rtpmap:107 
vnd.axis.metadata/90000 From 2d3f19643566f3e87a8936b6308ec78778e3e868 Mon Sep 17 00:00:00 2001 From: Anees Iqbal Date: Wed, 20 May 2026 17:35:46 +0200 Subject: [PATCH 3/3] :memo: Document optional streams and metadata in API.md [ci skip] - README: note that audio-only / metadata-only sessions are supported. - API.md: document the new VideoStream / AudioStream substructs and update the Quick Start snippet to the new shape (was still showing the flat field names and was missing the `.metadata` switch case). Also fills two pre-existing gaps that the metadata PR (#5) didn't catch: adds the metadataEncoding field, the `.metadata` PublicCodecItem case, and a PublicMetadataFrame definition. - CHANGELOG: add Upcoming entries for the breaking SessionDescription shape change (Breaking changes), the ONVIF analytics metadata stream support that #5 forgot to document (New), and the audio-init-failure state-coherence fix under Fixes (rather than New, since it's an internal cleanup not a user-visible feature). --- API.md | 46 +++++++++++++++++++++++++++++++++++++--------- CHANGELOG.md | 12 ++++++++++++ README.md | 11 ++++++++--- 3 files changed, 57 insertions(+), 12 deletions(-) diff --git a/API.md b/API.md index 6eaa88a..5771f13 100644 --- a/API.md +++ b/API.md @@ -10,7 +10,9 @@ let session = RTSPClientSession( credentials: Credentials(username: "admin", password: "pass")) let desc = try await session.start() -// desc.videoCodec, desc.resolution, desc.audioCodec, etc. +// desc.video?.codec / .resolution / .sps / .pps / .vps / .clockRate +// desc.audio?.codec / .sampleRate / .channels / .extraData +// desc.metadataEncoding (e.g. "vnd.onvif.metadata") for try await item in session.frames() { switch item { @@ -18,6 +20,8 @@ for try await item in session.frames() { // frame.nalus — AVCC NAL units for VideoToolbox case .audio(let frame): // frame.data — raw audio (PCMA/PCMU/AAC/etc.) 
+ case .metadata(let frame): + // frame.data — ONVIF analytics XML (possibly GZIP-compressed) case .rtcp: break } @@ -82,19 +86,29 @@ enum Transport: Sendable { Returned by `start()` with stream metadata parsed from SDP. +At least one of `video`, `audio`, or `metadataEncoding` is non-`nil`; a session with zero usable streams is rejected at `start()`. + ```swift struct SessionDescription: Sendable { - let videoCodec: VideoCodec - let sps: Data - let pps: Data + let video: VideoStream? + let audio: AudioStream? + let metadataEncoding: String? // e.g. "vnd.onvif.metadata" +} + +struct VideoStream: Sendable { + let codec: VideoCodec + let clockRate: UInt32 + let sps: Data? // nil until parameters observed + let pps: Data? let vps: Data? // H.265 only let resolution: (width: Int, height: Int)? - let clockRate: UInt32 +} - let audioCodec: PublicAudioCodec? - let audioSampleRate: UInt32? - let audioChannels: UInt16? - let audioExtraData: Data? // e.g. AudioSpecificConfig for AAC +struct AudioStream: Sendable { + let codec: PublicAudioCodec + let sampleRate: UInt32 // Hz + let channels: UInt16? + let extraData: Data? // e.g. AudioSpecificConfig for AAC } ``` @@ -129,6 +143,7 @@ enum PublicAudioCodec: Sendable { enum PublicCodecItem: Sendable { case video(PublicVideoFrame) case audio(PublicAudioFrame) + case metadata(PublicMetadataFrame) case rtcp(PublicRTCPPacket) } ``` @@ -164,6 +179,19 @@ struct PublicAudioFrame: Sendable { } ``` +### PublicMetadataFrame + +Raw payload from an analytics-metadata RTP stream (e.g. ONVIF XML, possibly GZIP-compressed). Consumers handle decoding based on `encodingName`. + +```swift +struct PublicMetadataFrame: Sendable { + let data: Data // Raw payload (depacketized across RTP fragments) + let timestamp: Double // Presentation timestamp in seconds + let encodingName: String // e.g. 
"vnd.onvif.metadata" + let loss: UInt16 // RTP packets lost before this frame +} +``` + ### PublicRTCPPacket ```swift diff --git a/CHANGELOG.md b/CHANGELOG.md index bbf31cd..86bdd59 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,10 +2,22 @@ ## Upcoming +### Breaking changes + +- `SessionDescription` now groups video and audio fields into `VideoStream?` and `AudioStream?` substructs. Replaces the flat `videoCodec` / `sps` / `pps` / `clockRate` / `audioCodec` / `audioSampleRate` / `audioChannels` / `audioExtraData` fields. A session is valid as long as any one of video, audio, or analytics metadata is set up — audio-only and metadata-only RTSP configurations (e.g. Axis cameras with `video=0`) now work end-to-end. Consumers branch on `desc.video != nil` (or `desc.audio != nil`) before configuring their decoders. + +### New + +- ONVIF analytics metadata stream support (`vnd.onvif.metadata` per the ONVIF Streaming Specification). Surfaced as `PublicCodecItem.metadata(PublicMetadataFrame)` in the `session.frames()` stream, with discoverability via `SessionDescription.metadataEncoding`. Best-effort: malformed metadata SDP or a failed application SETUP degrades to a diagnostic without aborting video/audio. + ### Improvements - Add visionOS 1.0 to supported platforms +### Fixes + +- Audio depacketizer init failures now null out the audio stream state (index, encoding name, clock rate, channels), mirroring the metadata-init failure path. Previously the indices stayed set while the depacketizer was nil; packets on that channel were silently dropped by the dispatch loop but `SessionDescription` could still claim the stream existed. Required to keep the "at least one usable stream" guard honest in audio-only sessions. 
+ ## 0.2.0 ### Breaking changes diff --git a/README.md b/README.md index 96f3cf8..98fc370 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ A pure-Swift RTSP client library for streaming live video and audio from IP came - **H.264 and H.265/HEVC video** — depacketized to AVCC format, ready for VideoToolbox - **Audio** — AAC, PCMU, PCMA, G.722, G.726, L16, G.723.1 - **ONVIF analytics metadata** — raw XML documents from the camera's `application` RTSP stream +- **Optional streams** — any combination of video / audio / metadata is supported; audio-only or metadata-only sessions (e.g. Axis `video=0`) work end-to-end - **Zero dependencies** — only Foundation, Network, and CryptoKit - **Swift 6** — strict concurrency with async/await and AsyncThrowingStream @@ -45,9 +46,13 @@ let session = RTSPClientSession( // Connect and get stream metadata let desc = try await session.start() -// desc.videoCodec, desc.resolution, desc.sps, desc.pps, desc.vps -// desc.audioCodec, desc.audioSampleRate, desc.audioChannels -// desc.metadataEncoding — non-nil if an ONVIF metadata stream is active +// desc.video, desc.audio, desc.metadataEncoding — at least one is non-nil +// desc.video?.codec / .clockRate / .sps / .pps / .vps / .resolution +// desc.audio?.codec / .sampleRate / .channels / .extraData + +if let video = desc.video { + // configure a video decoder (VideoToolbox, etc.) +} // Consume depacketized frames for try await item in session.frames() {