diff --git a/PretextMap.cpp b/PretextMap.cpp old mode 100644 new mode 100755 index f1d0599..01e9313 --- a/PretextMap.cpp +++ b/PretextMap.cpp @@ -499,134 +499,104 @@ void ProcessBodyLine(void *in) { line_buffer *buffer = (line_buffer *)in; - if (!buffer) { - PrintError("Null buffer passed to ProcessBodyLine"); - Global_Error_Flag = 1; - return; - } - u08 *line = buffer->line; - if (!line) { - PrintError("Null line in buffer"); - Global_Error_Flag = 1; - return; - } ForLoop(buffer->nLines) { u64 nLinesTotal = __atomic_add_fetch(&Total_Reads_Processed, 1, 0); - // Skip readID - while (*line && *line != '\t') line++; - if (!*line) { - PrintError("Malformed line - missing fields after readID"); - Global_Error_Flag = 1; - return; - } - line++; // Skip tab - - // Parse chr1 - u32 contigName1[16] = {0}; - line = PushStringIntoIntArray(contigName1, ArrayCount(contigName1), line, '\t'); - if (!*line) { - PrintError("Malformed line - missing pos1"); - Global_Error_Flag = 1; - return; - } - line++; // Skip tab + while (*line++ != '\t') {} + u32 len = 1; + u32 flags; - // Parse pos1 - char pos1Str[32] = {0}; - u32 pos1Len = 0; - while (line[pos1Len] && line[pos1Len] != '\t' && pos1Len < sizeof(pos1Str)-1) { - pos1Str[pos1Len] = line[pos1Len]; - pos1Len++; - } - if (!pos1Len || line[pos1Len] != '\t') { - PrintError("Malformed line - invalid pos1"); - Global_Error_Flag = 1; - return; + if (File_Type == sam) + { + while (*++line != '\t') ++len; + flags = StringToInt(line, len); } - pos1Str[pos1Len] = '\0'; - - char *endPtr; - errno = 0; - u64 relPos1 = strtoull(pos1Str, &endPtr, 10); - if (errno || *endPtr != '\0') { - PrintError("Invalid pos1 value: %s", pos1Str); - Global_Error_Flag = 1; - return; + else + { + --line; + flags = 0x1; } - line += pos1Len + 1; // Skip number and tab - // Parse chr2 - u32 contigName2[16] = {0}; - line = PushStringIntoIntArray(contigName2, ArrayCount(contigName2), line, '\t'); - if (!*line) { - PrintError("Malformed line - missing pos2"); - Global_Error_Flag = 1; - return; - } - line++; // Skip tab + if ((flags < 128) && (flags & 0x1) && !(flags & 0x4) && !(flags & 0x8)) + { + u32 contigName1[16]; + line = PushStringIntoIntArray(contigName1, ArrayCount(contigName1), ++line, '\t'); - // Parse pos2 - char pos2Str[32] = {0}; - u32 pos2Len = 0; - while (line[pos2Len] && line[pos2Len] != '\t' && pos2Len < sizeof(pos2Str)-1) { - pos2Str[pos2Len] = line[pos2Len]; - pos2Len++; - } - if (!pos2Len) { - PrintError("Malformed line - invalid pos2"); - Global_Error_Flag = 1; - return; - } - pos2Str[pos2Len] = '\0'; - - errno = 0; - u64 relPos2 = strtoull(pos2Str, &endPtr, 10); - if (errno || (*endPtr != '\0' && *endPtr != '\t' && *endPtr != '\n')) { - PrintError("Invalid pos2 value: %s", pos2Str); - Global_Error_Flag = 1; - return; - } + len = 0; + while (*++line != '\t') ++len; + u64 relPos1 = StringToInt64(line, len); - // Skip remaining fields (strand1, strand2) - while (*line && *line != '\n') line++; - if (*line == '\n') line++; + u32 mapq; + if (File_Type == sam) + { + len = 0; + while (*++line != '\t') ++len; + mapq = StringToInt(line, len); + } + else + { + mapq = Min_Map_Quality; + } - // Look up contigs and add to image - contig *cont1 = 0; - if (!ContigHashTableLookup(contigName1, ArrayCount(contigName1), &cont1)) { - if (*(char*)contigName1 != '!') // suppress error for '!' contig - PrintError("Unknown contig: %.64s", (char*)contigName1); - continue; - } + if (mapq >= Min_Map_Quality) + { + if (File_Type == sam) while (*++line != '\t') {} - contig *cont2 = 0; - if (!ContigHashTableLookup(contigName2, ArrayCount(contigName2), &cont2)) { - if (*(char*)contigName2 != '!') // suppress error for '!' contig - PrintError("Unknown contig: %.64s", (char*)contigName2); - continue; - } + u32 sameContig = 0; + u32 contigName2[16]; - // Validate positions - if (relPos1 > cont1->length) { - PrintError("Position %lu exceeds contig length %lu", relPos1, cont1->length); - continue; - } - if (relPos2 > cont2->length) { - PrintError("Position %lu exceeds contig length %lu", relPos2, cont2->length); - continue; - } + if (*++line == '=') + { + sameContig = 1; + ++line; + } + else + { + line = PushStringIntoIntArray(contigName2, ArrayCount(contigName2), line, '\t'); + } - // Calculate absolute genome positions - u64 read1 = cont1->previousCumulativeLength + relPos1; - u64 read2 = cont2->previousCumulativeLength + relPos2; + len = 0; + while (*++line != '\t') ++len; + u64 relPos2 = StringToInt64(line, len); - // Add to contact matrix - AddReadPairToImage(read1, read2); - __atomic_add_fetch(&Total_Good_Reads, 1, 0); + contig *cont = 0; + ContigHashTableLookup(contigName1, ArrayCount(contigName1), &cont); + + if (cont) + { + u64 cummLen1 = cont->previousCumulativeLength; + u64 read1 = cummLen1 + relPos1; + + u64 cummLen2 = 0; + if (sameContig) + { + cummLen2 = cummLen1; + } + else + { + cont = 0; + ContigHashTableLookup(contigName2, ArrayCount(contigName2), &cont); + + if (cont) + { + cummLen2 = cont->previousCumulativeLength; + } + } + + if (cont) + { + u64 read2 = cummLen2 + relPos2; + + AddReadPairToImage(read1, read2); + + __atomic_add_fetch(&Total_Good_Reads, 1, 0); + } + } + } + } if ((nLinesTotal % StatusPrintEvery) == 0) { @@ -634,10 +604,17 @@ ProcessBodyLine(void *in) memset((void *)buff, ' ', 80); buff[80] = 0; printf("\r%s", buff); - stbsp_snprintf(buff, sizeof(buff), "[PretextMap status] :: %$u read-pairs mapped", nLinesTotal); + + if (File_Type == sam) + stbsp_snprintf(buff, sizeof(buff), "[PretextMap status] :: %$u reads processed, %$u read-pairs mapped", nLinesTotal, Total_Good_Reads); + else + stbsp_snprintf(buff, sizeof(buff), "[PretextMap status] :: %$u read-pairs mapped", nLinesTotal); + printf("\r%s", buff); fflush(stdout); } + + while (*line++ != '\n') {} } AddLineBufferToQueue(Line_Buffer_Queue, buffer); @@ -876,6 +853,16 @@ ProcessHeaderLine(u08 *line) line += nameLen; while (*line == ' ' || *line == '\t') line++; + // For SAM format, skip "LN:" prefix before reading length + if (File_Type == sam) { + if (strncmp((char*)line, "LN:", 3) != 0) { + PrintError("Expected 'LN:' after contig name in SAM header line"); + Global_Error_Flag = 1; + return; + } + line += 3; // Skip "LN:" + } + // Parse length value char lengthStr[32] = {0}; int lengthLen = 0; @@ -1418,28 +1405,6 @@ GrabStdIn() // Start of new line if (linePtr == 0) { lineCount++; - // Verify start of pairs format - if (lineCount == 1) { - if (character != '#') { - PrintError("Invalid pairs format: First line must start with '#' but found '0x%02X' (%c)", - character, (character >= 32 && character <= 126) ? character : '?'); - PrintError("Buffer contents at error:"); - for (u32 i = 0; i < 32 && i < readBuffer->size; i++) { - if (i % 16 == 0) fprintf(stderr, "\n%04X: ", i); - fprintf(stderr, "%02X ", readBuffer->buffer[i]); - if ((i + 1) % 16 == 0) { - fprintf(stderr, " "); - for (u32 j = i - 15; j <= i; j++) { - u08 c = readBuffer->buffer[j]; - fprintf(stderr, "%c", (c >= 32 && c <= 126) ? c : '.'); - } - } - } - fprintf(stderr, "\n"); - Global_Error_Flag = 1; - return; - } - } } // Check for buffer overflow before adding character @@ -2312,7 +2277,6 @@ MainArgs u08 highRes = 0; u32 outputNameGiven = 0; - const char *inputFile = NULL; const char *filterIncludeString = 0; const char *filterExcludeString = 0; for ( u32 index = 1; @@ -2415,14 +2379,9 @@ MainArgs { highRes = 1; } - else if (!inputFile && ArgBuffer[index][0] != '-') - { - // First non-option argument is treated as input file - inputFile = ArgBuffer[index]; - } else { - PrintError("Unknown option or multiple input files specified: %s", ArgBuffer[index]); + PrintError("Unknown option: %s", ArgBuffer[index]); exit(EXIT_FAILURE); } } @@ -2467,27 +2426,6 @@ MainArgs PrintStatus("Creating filters"); CreateFilters(filterIncludeString, filterExcludeString); - // Open input file if specified - if (inputFile) { - PrintStatus("Opening input file: %s", inputFile); - int fd = open(inputFile, O_RDONLY); - if (fd < 0) { - PrintError("Failed to open input file '%s': %s", inputFile, strerror(errno)); - exit(EXIT_FAILURE); - } - // Redirect stdin to the input file - if (dup2(fd, STDIN_FILENO) < 0) { - PrintError("Failed to redirect stdin: %s", strerror(errno)); - close(fd); - exit(EXIT_FAILURE); - } - close(fd); - PrintStatus("Successfully opened input file"); - } else { - PrintStatus("Reading from stdin"); - } - - PrintStatus("Starting input processing thread"); ThreadPoolAddTask(Thread_Pool, GrabStdIn, 0); PrintStatus("Waiting for processing to complete"); ThreadPoolWait(Thread_Pool); @@ -2533,4 +2471,4 @@ MainArgs fclose(Output_File); EndMain; -} \ No newline at end of file +} diff --git a/README.md b/README.md index 7c01d16..00bb546 100644 --- a/README.md +++ b/README.md @@ -59,17 +59,17 @@ Note: also filtering with samtools view as in the above example (... seq_1 seq_2 * --highRes: high resolution output, only supported by PretextView >=0.2.5 ## Recent fixes and improvements:
+* **Fixed SAM format header parsing**: Correctly handles SAM format header lines by properly skipping "LN:" prefix when parsing contig lengths from `@SQ` lines +* **Fixed SAM format body line processing**: `ProcessBodyLine` now correctly handles both SAM and pairs formats, with proper parsing of SAM flags, mapping quality, and contig names +* **Improved read buffer error handling**: Enhanced error checking in `FillReadBuffer` with validation for invalid file descriptors and read errors +* **Standardized input handling**: Input is now read exclusively from stdin (via pipes), matching the original design. BAM files should be piped through `samtools view -h` +* **Removed premature format validation**: Fixed issue where SAM format files were incorrectly rejected due to early validation checks * **Improved pairs file input handling**: Enhanced parsing of pairs format files with better error detection and reporting * **Robust header line processing**: Improved parsing of `#chromsize:` header lines in pairs format, with proper handling of format declaration and column lines -* **Enhanced input validation**: Added comprehensive validation for: - - Position values (checking for valid numeric values and bounds) - - Contig name parsing and lookup - - Line format validation with detailed error messages +* **Enhanced input validation**: Added comprehensive validation for position values, contig name parsing, and line format validation with detailed error messages * **Better error handling**: Extensive null pointer checks, buffer overflow protection, and informative error messages throughout the codebase -* **Improved buffer management**: Increased line buffer size for pairs format (1MB) and rewritten `GetNextReadBuffer` function for more reliable file reading +* **Improved buffer management**: Increased line buffer size for pairs format (1MB) and improved `GetNextReadBuffer` function for more reliable file reading * **UTF-8 BOM support**: Automatic detection and skipping of UTF-8 Byte Order Mark (BOM) in input files -* **Position bounds checking**: Validation that read positions do not exceed contig lengths -* **File input support**: Better handling of both stdin and file-based input sources # Map Format Contact maps are saved in a compressed texture format (hence the name). Maps can be read by PretextView (https://github.com/wtsi-hpag/PretextView). Expect pretext map files to take around 30 to 50 M of disk space each. diff --git a/meson.build b/meson.build index 59baa0f..9a0f9c8 100644 --- a/meson.build +++ b/meson.build @@ -1,7 +1,7 @@ project('PretextMap', ['c', 'cpp'], license: 'MIT', meson_version: '>=0.57.1', - version: '0.2.2' + version: '0.2.3' ) deps = [dependency('threads')]