Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Examples/CameraViewer/main.swift
Original file line number Diff line number Diff line change
Expand Up @@ -330,7 +330,7 @@ final class CameraViewerDelegate: NSObject, NSApplicationDelegate {
layer.enqueue(sample)
}

case .rtcp:
case .metadata, .rtcp:
break
}
}
Expand Down
15 changes: 13 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ A pure-Swift RTSP client library for streaming live video and audio from IP came

- **H.264 and H.265/HEVC video** — depacketized to AVCC format, ready for VideoToolbox
- **Audio** — AAC, PCMU, PCMA, G.722, G.726, L16, G.723.1
- **ONVIF analytics metadata** — raw XML documents from the camera's `application` RTSP stream
- **Zero dependencies** — only Foundation, Network, and CryptoKit
- **Swift 6** — strict concurrency with async/await and AsyncThrowingStream

Expand Down Expand Up @@ -46,6 +47,7 @@ let session = RTSPClientSession(
let desc = try await session.start()
// desc.videoCodec, desc.resolution, desc.sps, desc.pps, desc.vps
// desc.audioCodec, desc.audioSampleRate, desc.audioChannels
// desc.metadataEncoding — non-nil if an ONVIF metadata stream is active

// Consume depacketized frames
for try await item in session.frames() {
Expand All @@ -59,6 +61,10 @@ for try await item in session.frames() {
// frame.data — raw audio bytes (codec-specific)
// frame.codec, frame.sampleRate, frame.channels, frame.timestamp
break
case .metadata(let frame):
// frame.data — raw payload (typically ONVIF XML, possibly GZIP-compressed)
// frame.encodingName, frame.timestamp, frame.loss
break
case .rtcp:
break
}
Expand All @@ -81,6 +87,11 @@ See [API.md](API.md) for the full API reference.
- AAC (RFC 3640) with aggregation and fragmentation
- PCMU (G.711 u-law), PCMA (G.711 A-law), L16, G.722, G.726, DVI4, G.723.1

### Metadata
- ONVIF analytics metadata (`vnd.onvif.metadata`) per the ONVIF Streaming Specification
- Concatenates RTP payload fragments and emits a frame on the marker bit (end-of-document)
- Best-effort: a rejected metadata SETUP or malformed metadata SDP degrades to a diagnostic (metadata is simply not delivered) without aborting video/audio

### Protocol
- RTSP session management (DESCRIBE, SETUP, PLAY, TEARDOWN)
- RTSP message parsing and serialization
Expand Down Expand Up @@ -114,15 +125,15 @@ Sources/IPCamKit/
├── RTSP/ RTSP message model, parser, serializer
├── SDP/ SDP session description parser (RFC 8866)
├── RTP/ RTP/RTCP packets, Timeline, ChannelMapping, InorderParser
├── Codec/ H.264/H.265 depacketizers, NAL/SPS/PPS parsing, audio depacketizers
├── Codec/ H.264/H.265 depacketizers, NAL/SPS/PPS parsing, audio + metadata depacketizers
├── Auth/ Basic and Digest authentication
├── Transport/ NWConnection TCP/UDP transport
└── Client/ RTSP session, DESCRIBE/SETUP/PLAY parsers, Presentation
```

## Testing

90 tests across 15 suites covering RTSP parsing, SDP, RTP, H.264/H.265 depacketization, AAC, simple audio, authentication, and integration:
100+ tests across 15+ suites covering RTSP parsing, SDP, RTP, H.264/H.265 depacketization, AAC, simple audio, ONVIF metadata depacketization, authentication, and integration:

```bash
swift test
Expand Down
166 changes: 164 additions & 2 deletions Sources/IPCamKit/Client/RTSPSession.swift
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ public struct SessionDescription: Sendable {
public let audioChannels: UInt16?
/// Codec-specific extra data (e.g. AudioSpecificConfig for AAC).
public let audioExtraData: Data?

/// SDP encoding name of the analytics-metadata stream if one was set up
/// (e.g. `vnd.onvif.metadata`), or `nil` if no metadata stream is active.
public let metadataEncoding: String?
}

/// RTSP client session that manages the full RTSP lifecycle.
Expand Down Expand Up @@ -139,10 +143,11 @@ public final class RTSPClientSession: Sendable {
}
}

/// A decoded frame (video or audio) exposed to consumers.
/// A decoded frame (video, audio, or metadata) exposed to consumers.
public enum PublicCodecItem: Sendable {
/// A video frame, wrapped as a `PublicVideoFrame`.
case video(PublicVideoFrame)
/// An audio frame, wrapped as a `PublicAudioFrame`.
case audio(PublicAudioFrame)
/// A metadata frame from an RTSP `application` stream, wrapped as a
/// `PublicMetadataFrame`.
case metadata(PublicMetadataFrame)
/// An RTCP packet, wrapped as a `PublicRTCPPacket`.
case rtcp(PublicRTCPPacket)
}

Expand Down Expand Up @@ -171,6 +176,23 @@ public struct PublicAudioFrame: Sendable {
public let loss: UInt16
}

/// A metadata frame from an RTSP `application` stream (typically ONVIF analytics).
///
/// Delivered to consumers as `PublicCodecItem.metadata`.
public struct PublicMetadataFrame: Sendable {
/// Raw payload bytes, passed through from the depacketized frame unchanged.
/// For `vnd.onvif.metadata` this is a UTF-8 XML document with root
/// `tt:MetaDataStream`, optionally GZIP-compressed. Use this frame's
/// `encodingName` (taken from the SDP) to identify the stream's encoding.
// NOTE(review): presumably no decompression or XML parsing happens upstream —
// confirm against ApplicationDepacketizer.
public let data: Data

/// Presentation timestamp in seconds, derived from the RTP timestamp
/// (populated from the depacketized frame's `timestamp.elapsedSeconds`).
public let timestamp: Double

/// SDP encoding name (e.g. `vnd.onvif.metadata`). The session substitutes an
/// empty string if it has no encoding name recorded for the stream.
public let encodingName: String

/// Number of RTP packets lost before this frame.
public let loss: UInt16
}

/// A video frame exposed to consumers.
public struct PublicVideoFrame: Sendable {
/// NAL units in AVCC format (4-byte big-endian length prefix + NAL bytes).
Expand Down Expand Up @@ -243,12 +265,15 @@ actor SessionState {
private var authenticator: RTSPAuthenticator?
private var depacketizer: VideoDepacketizer?
private var audioDepacketizer: AudioDepacketizer?
private var applicationDepacketizer: ApplicationDepacketizer?
private var url: String?
private var videoStreamIndex: Int?
private var audioStreamIndex: Int?
private var applicationStreamIndex: Int?
private var audioEncodingName: String?
private var audioClockRate: UInt32?
private var audioChannels: UInt16?
private var applicationEncodingName: String?
private var channelMappings = ChannelMappings()
private var inorderParsers: [Int: InorderParser] = [:]
private var userAgent: String?
Expand Down Expand Up @@ -375,6 +400,68 @@ actor SessionState {
audioChannels = audioStream.channels
}

// Find and SETUP application (metadata) stream — optional, best-effort.
// If SETUP fails (camera advertises the stream but rejects it, or any
// transport error), disable metadata locally rather than aborting the
// session. The channel slot stays assigned (no other streams follow).
var applicationIdx = presMut.streams.firstIndex(where: { s in
s.media == "application" && isApplicationEncodingSupported(s.encodingName)
})
var applicationSetupSSRC: UInt32?

if let idx = applicationIdx {
let applicationStream = presMut.streams[idx]
let applicationSetupURL = applicationStream.control ?? url
var applicationSetupHeaders: [(String, String)] = []
if transport == .tcp {
let applicationChannelId = channelMappings.nextUnassigned() ?? 4
applicationSetupHeaders.append(
(
"Transport",
"RTP/AVP/TCP;unicast;interleaved=\(applicationChannelId)-\(applicationChannelId + 1)"
))
try channelMappings.assign(
channelId: applicationChannelId, streamIndex: idx)
} else {
applicationSetupHeaders.append(("Transport", "RTP/AVP;unicast"))
}
if let sid = sessionId {
applicationSetupHeaders.append(("Session", sid))
}

do {
let applicationSetupResp = try await sendRequest(
method: .setup, url: applicationSetupURL,
extraHeaders: applicationSetupHeaders)
let applicationSetup = try parseSetup(response: applicationSetupResp)
if let prev = sessionId, prev != applicationSetup.session.id {
onDiagnostic?(
RTSPDiagnostic(
severity: .warning,
message:
"Camera issued a new Session ID at application SETUP "
+ "(\(prev) -> \(applicationSetup.session.id)); rolling forward."))
}
sessionId = applicationSetup.session.id
applicationSetupSSRC = applicationSetup.ssrc
presMut.streams[idx].state = .setup(
StreamStateInit(
ssrc: applicationSetup.ssrc, initialSeq: nil,
initialRtptime: nil, ctx: .dummy))

applicationStreamIndex = idx
applicationEncodingName = applicationStream.encodingName
} catch {
onDiagnostic?(
RTSPDiagnostic(
severity: .warning,
message:
"Application SETUP failed: \(error); "
+ "metadata will not be delivered."))
applicationIdx = nil
}
}

// PLAY
var playHeaders: [(String, String)] = []
if let sid = sessionId {
Expand Down Expand Up @@ -459,6 +546,44 @@ actor SessionState {
}
}

// Initialize application (metadata) depacketizer + inorder parser.
// Best-effort: if the timeline can't be built (e.g. malformed clock
// rate in SDP), disable the stream rather than failing the session.
if let applicationIdx = applicationIdx {
let applicationStream = presMut.streams[applicationIdx]

var applicationStart: UInt32?
var applicationSeq: UInt16?
var resolvedApplicationSsrc = applicationSetupSSRC

if case .setup(let init_) = presMut.streams[applicationIdx].state {
applicationStart = init_.initialRtptime
if let seq = init_.initialSeq, seq != 0, seq != 1 {
applicationSeq = seq
}
if let s = init_.ssrc { resolvedApplicationSsrc = s }
}

do {
let applicationTimeline = try Timeline(
start: applicationStart, clockRate: applicationStream.clockRateHz)
inorderParsers[applicationIdx] = InorderParser(
ssrc: resolvedApplicationSsrc, nextSeq: applicationSeq,
isTcp: transport == .tcp, timeline: applicationTimeline,
onDiagnostic: onDiagnostic)
applicationDepacketizer = ApplicationDepacketizer(onDiagnostic: onDiagnostic)
} catch {
onDiagnostic?(
RTSPDiagnostic(
severity: .warning,
message:
"Failed to initialize application stream: \(error); "
+ "metadata will not be delivered."))
applicationStreamIndex = nil
applicationEncodingName = nil
}
}

isPlaying = true

// Build session description
Expand Down Expand Up @@ -498,7 +623,8 @@ actor SessionState {
audioCodec: resolvedAudioCodec,
audioSampleRate: resolvedAudioRate,
audioChannels: resolvedAudioChannels,
audioExtraData: audioDepacketizer?.audioParameters?.extraData
audioExtraData: audioDepacketizer?.audioParameters?.extraData,
metadataEncoding: applicationEncodingName
)
}

Expand Down Expand Up @@ -577,6 +703,33 @@ actor SessionState {
}
}
audioDepacketizer = depkt
} else if let applicationIdx = applicationStreamIndex,
mapping.streamIndex == applicationIdx
{
guard var depkt = applicationDepacketizer else { continue }
if let pkt = try parser.rtp(
data: interleaved.data, ctx: .dummy,
streamId: mapping.streamIndex, streamCtx: .dummy)
{
try depkt.push(pkt)
while let result = depkt.pull() {
switch result {
case .success(.metadataFrame(let frame)):
let publicFrame = PublicMetadataFrame(
data: frame.data,
timestamp: frame.timestamp.elapsedSeconds,
encodingName: applicationEncodingName ?? "",
loss: frame.loss
)
continuation.yield(.metadata(publicFrame))
case .failure(let err):
throw RTSPError.depacketizationError("Metadata depacketization failed: \(err)")
default:
break
}
}
}
applicationDepacketizer = depkt
}

inorderParsers[mapping.streamIndex] = parser
Expand Down Expand Up @@ -735,6 +888,15 @@ actor SessionState {
}
}

/// Reports whether an SDP `application` stream with the given encoding name
/// is one this session knows how to handle.
///
/// The only accepted value is the ONVIF analytics encoding
/// `vnd.onvif.metadata`; the check is a case-sensitive string equality, so any
/// other spelling is rejected.
///
/// - Parameter name: Encoding name of the candidate stream.
/// - Returns: `true` for `vnd.onvif.metadata`, `false` for everything else.
private func isApplicationEncodingSupported(_ name: String) -> Bool {
  let onvifMetadataEncoding = "vnd.onvif.metadata"
  return name == onvifMetadataEncoding
}

private func publicAudioCodec(from encoding: String) -> PublicAudioCodec {
switch encoding {
case "mpeg4-generic": return .aac
Expand Down
Loading
Loading