|
5 | 5 | namespace Farzai\Geonames\Converter; |
6 | 6 |
|
7 | 7 | use Farzai\Geonames\Exceptions\GeonamesException; |
| 8 | +use Generator; |
8 | 9 |
|
/**
 * Converts GeoNames gazetteer data from ZIP files to JSON format.
 *
 * This converter extracts geographical feature data from GeoNames ZIP archives
 * and outputs it as a JSON file with administrative code name resolution.
 * Records are streamed from the source file to the output one at a time, so
 * this class never holds the full data set in memory.
 */
class GazetteerConverter extends AbstractGazetteerConverter
{
    /**
     * Process the gazetteer data file and write to JSON output.
     *
     * The output is a single JSON array written incrementally: "[", then the
     * comma-separated encoded records, then "]". If anything fails part-way
     * through, the partially written output file is removed so that a
     * truncated (invalid) JSON document is never left behind.
     *
     * @param  string  $txtFile  Path to the source TXT file containing gazetteer data
     * @param  string  $outputFile  Path to the output JSON file
     *
     * @throws GeonamesException When the output cannot be opened or written, or a record cannot be encoded
     */
    protected function processFile(string $txtFile, string $outputFile): void
    {
        // The line count is only used to size the progress bar.
        // NOTE(review): countLines()/createProgressBar()/finishProgressBar() are
        // defined outside this class; the bar may be null (hence the `?->` below).
        $totalLines = $this->countLines($txtFile);
        $progressBar = $this->createProgressBar($totalLines);

        $handle = fopen($outputFile, 'wb');
        if ($handle === false) {
            throw GeonamesException::fileOperationFailed('open for writing', $outputFile);
        }

        $completed = false;

        try {
            $this->writeToHandle($handle, '[', $outputFile);
            $first = true;

            foreach ($this->streamGazetteerRecords($txtFile) as $record) {
                $json = json_encode($record, JSON_UNESCAPED_UNICODE);
                if ($json === false) {
                    throw GeonamesException::fileOperationFailed('encode JSON', $outputFile);
                }

                // Every record except the first is preceded by a separator.
                $this->writeToHandle($handle, $first ? $json : ','.$json, $outputFile);
                $first = false;

                // Advances once per emitted record; blank or unparseable lines
                // are counted in $totalLines but do not advance the bar.
                $progressBar?->advance();
            }

            $this->writeToHandle($handle, ']', $outputFile);
            $completed = true;
        } finally {
            fclose($handle);

            // A failed run would otherwise leave an unterminated JSON array on disk.
            if (! $completed && is_file($outputFile)) {
                unlink($outputFile);
            }

            $this->finishProgressBar($progressBar);
        }
    }

    /**
     * Write content to a file handle with error checking.
     *
     * Both an outright failure and a short write (fewer bytes written than
     * requested) are treated as errors, since either would corrupt the output.
     *
     * @param  resource  $handle  The file handle to write to
     * @param  string  $content  The content to write
     * @param  string  $outputFile  The output file path (for error messages)
     *
     * @throws GeonamesException When the write operation fails or is incomplete
     */
    private function writeToHandle($handle, string $content, string $outputFile): void
    {
        // fwrite() returns false on failure or the number of bytes written;
        // strlen() is the byte length, so one comparison covers both cases.
        if (fwrite($handle, $content) !== strlen($content)) {
            throw GeonamesException::fileOperationFailed('write', $outputFile);
        }
    }

    /**
     * Stream gazetteer records from a TXT file.
     *
     * Reads the file line by line and yields each successfully parsed record,
     * so only the current line is held in memory. Blank lines and lines for
     * which parseGazetteerLine() returns null are skipped.
     *
     * @param  string  $txtFile  Path to the TXT file containing gazetteer data
     * @return Generator<int, array<string, mixed>> Yields gazetteer records
     *
     * @throws GeonamesException When the file cannot be opened
     */
    protected function streamGazetteerRecords(string $txtFile): Generator
    {
        $handle = fopen($txtFile, 'r');

        if ($handle === false) {
            throw GeonamesException::fileOperationFailed('open', $txtFile);
        }

        try {
            while (($line = fgets($handle)) !== false) {
                // Strip only the line terminator. Tabs separate the fields, so a
                // full trim() would drop empty leading/trailing columns before
                // the line reaches the parser.
                $line = rtrim($line, "\r\n");

                // Strict comparison: empty() would also discard the string "0".
                if (trim($line) === '') {
                    continue;
                }

                $record = $this->parseGazetteerLine($line);
                if ($record !== null) {
                    yield $record;
                }
            }
        } finally {
            // Runs on normal completion, on an exception, or when the consumer
            // abandons the generator early.
            fclose($handle);
        }
    }
}
0 commit comments