-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtxt_from_xml.py
More file actions
32 lines (27 loc) · 942 Bytes
/
txt_from_xml.py
File metadata and controls
32 lines (27 loc) · 942 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import xml.etree.ElementTree as ET
import pathlib
import os
'''
Pulls the text of the CONTENT attributes out of Concordiensis XML files.
Run this from $path parent directory.
Remove dot files (rm .*) before running.
'''
def _page_text(xml_path):
    """Return the CONTENT values of one XML file's top-level children.

    Each direct child of the root element contributes its ``CONTENT``
    attribute, stripped of surrounding whitespace; the values are joined
    with single spaces. Children that carry no ``CONTENT`` attribute are
    skipped instead of aborting the whole run with a ``KeyError``.
    """
    root = ET.parse(xml_path).getroot()
    values = (child.get("CONTENT") for child in root)
    return " ".join(value.strip() for value in values if value is not None)


def _issue_text(sub_dir):
    """Return the text of every ``.xml`` file directly inside *sub_dir*.

    Files are read in sorted filename order; each file yields one page and
    pages are separated by a blank line.
    """
    pages = [
        _page_text(os.path.join(sub_dir, name))
        for name in sorted(os.listdir(sub_dir))
        if name.endswith(".xml")
    ]
    return "\n\n".join(pages)


def main(path=None):
    """Write one ``<subdirectory>.txt`` file per subdirectory of *path*.

    Args:
        path: Parent directory whose subdirectories hold the XML files.
            When ``None`` (the default, used when run as a script) the
            path is read interactively from standard input.

    Each output file is created next to its subdirectory and is written
    as UTF-8 so the result does not depend on the platform's locale.
    """
    if path is None:
        path = input("Path to parent dir")
    for sub_dir in pathlib.Path(path).iterdir():
        # Stray files in the parent directory (dot files, .txt output of a
        # previous run) are not page folders; os.listdir would fail on them.
        if not sub_dir.is_dir():
            continue
        # Build the text before opening the output so a parse failure does
        # not leave an empty .txt file behind.
        text = _issue_text(sub_dir)
        with open(f"{sub_dir}.txt", "w", encoding="utf-8") as out_file:
            out_file.write(text)


if __name__ == "__main__":
    main()