Managing Transcript Files

I’m doing a PhD and as part of that I have interview transcripts to manage. You can read about my workflow for doing that on my PhD site: phd.anthonyarblaster.com/phd/Transcript-Management/. The main way I was able to achieve this was by creating a Python script that my shell calls to process each file.

The script is posted on GitHub Gists so it can be used by anyone. The zsh script is very simple as you can see below.

#!/bin/zsh
# Process a Zoom/Teams .vtt transcript for research use
function vtt_process() {
    # Full path to the companion Python processor — edit for your machine.
    local script="/user/pathname/vtt_process.py"
    # No transcript given: print usage and fail.
    if [[ -z "$1" ]]; then
        echo "Usage: vtt_process <file.vtt> [--self \"Speaker Name\"]"
        return 1
    fi
    # Forward all arguments (file plus optional --self "Name") to Python.
    python3 "$script" "$@"
}

Hosted with ❤️ on GitHub

This calls a Python script that turns the VTT file exported from Zoom — essentially just a text file — into something that manages the speakers without all the block numbers and timecode clutter.

#!/usr/bin/env python3
# vtt_process.py
# Version: 1.0
#
import sys
import re
import argparse
from pathlib import Path
# ─── US → UK spelling map ─────────────────────────────────────────────────────
# Keys are lowercase. Substitution is case-preserving (see preserve_case).
SPELLING = {
    # Keys MUST be lowercase: apply_spelling looks words up via word.lower().
    # A list of common spelling differences between US and UK English.
    # ...
}
# ─── Helpers ──────────────────────────────────────────────────────────────────
def preserve_case(original, replacement):
    """Return *replacement* re-capitalised to match the pattern of *original*.

    ALL-CAPS original -> ALL-CAPS replacement; Title-case original ->
    replacement with its first letter capitalised; otherwise unchanged.
    """
    if not original or not replacement:
        # Guard: indexing original[0] / replacement[0] would raise on ''.
        return replacement
    if original.isupper():
        return replacement.upper()
    if original[0].isupper():
        return replacement[0].upper() + replacement[1:]
    return replacement


def apply_spelling(text, mapping=None):
    """Replace US spellings with UK equivalents, preserving case.

    *mapping* defaults to the module-level SPELLING table; pass your own
    dict (lowercase keys) to override, e.g. for testing.
    """
    if mapping is None:
        mapping = SPELLING
    if not mapping:
        # An empty table would otherwise build the pattern r'\b()\b', which
        # matches empty strings and crashes the replacer with a KeyError.
        return text
    # Sort by length descending so longer alternatives (e.g. 'coloring')
    # are tried before their prefixes ('color').
    pattern = r'\b(' + '|'.join(
        re.escape(k) for k in sorted(mapping, key=len, reverse=True)
    ) + r')\b'

    def replacer(match):
        word = match.group(0)
        return preserve_case(word, mapping[word.lower()])

    return re.sub(pattern, replacer, text, flags=re.IGNORECASE)
# ─── VTT parsing ──────────────────────────────────────────────────────────────
def parse_vtt(content):
    """
    Parse a VTT file into a list of (speaker, text) tuples.
    Block numbers and timecode lines are discarded.
    """
    # Drop the WEBVTT header line before splitting into cue blocks.
    content = re.sub(r'^WEBVTT[^\n]*\n', '', content, count=1)
    cues = []
    for chunk in re.split(r'\n{2,}', content.strip()):
        # Keep only the payload lines: skip blanks, bare cue numbers
        # and "start --> end" timecode lines.
        kept = [
            stripped
            for stripped in (ln.strip() for ln in chunk.splitlines())
            if stripped
            and not re.match(r'^\d+$', stripped)
            and '-->' not in stripped
        ]
        if not kept:
            continue
        joined = ' '.join(kept)
        # Split "Speaker Name: utterance" on the first colon.
        m = re.match(r'^([^:]+):\s*(.+)$', joined, re.DOTALL)
        if m:
            cues.append((m.group(1).strip(), m.group(2).strip()))
        elif cues:
            # No "Name:" prefix — continuation of the previous speaker.
            prev_speaker, prev_text = cues[-1]
            cues[-1] = (prev_speaker, prev_text + ' ' + joined)
    return cues
def merge_consecutive_speakers(blocks):
    """Merge adjacent blocks from the same speaker into a single paragraph."""
    result = []
    for speaker, text in blocks:
        if result and result[-1][0] == speaker:
            # Same speaker as the previous turn: extend it in place.
            prev_speaker, prev_text = result[-1]
            result[-1] = (prev_speaker, prev_text + ' ' + text)
        else:
            result.append((speaker, text))
    return result
def assign_labels(blocks, self_name):
    """
    Replace full speaker names with short labels.
    self_name → I; all others → R, R2, R3 …
    """
    labels = {}
    respondents = 0
    for speaker, _ in blocks:
        if speaker not in labels:
            if speaker == self_name:
                labels[speaker] = 'I'
            else:
                respondents += 1
                # First respondent is plain "R"; later ones get a number.
                labels[speaker] = 'R' + ('' if respondents == 1 else str(respondents))
    return [(labels[speaker], text) for speaker, text in blocks]
# ─── Main ─────────────────────────────────────────────────────────────────────
# You can change the default interviewer name in the --self argument's default value below.
def main():
    """CLI entry point: parse arguments, process one VTT file, report turns."""
    parser = argparse.ArgumentParser(
        description='Process a Zoom/Teams .vtt transcript for research use.'
    )
    parser.add_argument('input', help='Path to the .vtt file')
    parser.add_argument(
        '--self',
        dest='self_name',
        default='Rose Tyler',  # Edit this default so you don't have to pass your own name every run.
        metavar='NAME',
        help='Speaker name to label as I (default: "Rose Tyler")',
    )
    args = parser.parse_args()

    input_path = Path(args.input).expanduser().resolve()
    if not input_path.exists():
        print(f'Error: file not found: {input_path}', file=sys.stderr)
        sys.exit(1)

    # Parse → merge same-speaker runs → relabel → UK spelling.
    raw = input_path.read_text(encoding='utf-8')
    turns = assign_labels(
        merge_consecutive_speakers(parse_vtt(raw)),
        args.self_name,
    )
    turns = [(label, apply_spelling(text)) for label, text in turns]

    output = '\n\n'.join(f'{label}: {text}' for label, text in turns)

    # Strip .vtt and any preceding .transcript suffix before naming the output.
    base = input_path.stem  # e.g. "foo.transcript"
    if base.endswith('.transcript'):
        base = base[: -len('.transcript')]
    output_path = input_path.parent / (base + '_edited.txt')
    output_path.write_text(output + '\n', encoding='utf-8')
    print(f'Written to: {output_path}')

    # Summarise how many turns each label got.
    counts = {}
    for label, _ in turns:
        counts[label] = counts.get(label, 0) + 1
    for label, count in sorted(counts.items()):
        print(f' {label}: {count} turn(s)')


if __name__ == '__main__':
    main()

Hosted with ❤️ on GitHub

As you can see, there is a huge block which just manages the transformation of spelling from US to UK English. An inelegant solution, but a solution nonetheless.

I didn’t code most of this Python as I have zero idea how to write Python. So I made use of Claude Code, which worked really well with a constrained problem set. There were a lot of revisions to make the processing happen consistently, and the scripts still produce errors. But then the original VTT also has errors, and this is really just a way of me being able to get to a transcript I can edit quickly with a minimum of fuss.

Once I have the text file I pull it into MS Word, where I have some existing styles that I want to apply. As a result, the template file has a Visual Basic macro to apply the styles to every page except my first page. This transforms the paragraphs so that the formatting is consistent throughout my transcript documents.

' Apply transcript styles to every paragraph after page 1.
' Paragraphs starting "I: " get the Interviewer style, "R: " the Respondent
' style; the three-character prefix is removed. Empty paragraphs are deleted.
' NOTE(review): only bare "I: " / "R: " are handled — "R2: " etc. from
' multi-respondent transcripts fall through unstyled; confirm intended.
Sub FormatPrefixParagraphs()
    Dim i As Long
    Dim txt As String
    Dim rng As Range
    Dim pg As Long
    ' Iterate backwards so deleting a paragraph does not shift the
    ' indices of paragraphs still to be visited.
    For i = ActiveDocument.Paragraphs.Count To 1 Step -1
        ' Page number at the end of this paragraph; page 1 is left untouched.
        pg = ActiveDocument.Paragraphs(i).Range.Information(wdActiveEndPageNumber)
        If pg = 1 Then GoTo NextPara
        txt = ActiveDocument.Paragraphs(i).Range.Text
        If txt = vbCr Then
            ' Paragraph contains only its paragraph mark — delete it.
            ActiveDocument.Paragraphs(i).Range.Delete
        ElseIf Left(txt, 3) = "I: " Then
            ' Remove the "I: " prefix, then style as Interviewer.
            Set rng = ActiveDocument.Paragraphs(i).Range
            rng.End = rng.Start + 3
            rng.Delete
            ActiveDocument.Paragraphs(i).Style = "Interviewer"
        ElseIf Left(txt, 3) = "R: " Then
            ' Remove the "R: " prefix, then style as Respondent.
            Set rng = ActiveDocument.Paragraphs(i).Range
            rng.End = rng.Start + 3
            rng.Delete
            ActiveDocument.Paragraphs(i).Style = "Respondent"
        End If
NextPara:
    Next i
End Sub

Having made the document apply formatting, I then had another problem. Word always wants to pull in macros from a template, and you can’t delete them. Which is no good for sharing or archiving. So once again I turned to Claude Code. And produced a script to edit the xml of a document or every word document in a directory.

#!/usr/bin/env python3
# detach_templates.py
import argparse
import os
import re
import shutil
import sys
import tempfile
import zipfile
def diagnose(docx_path):
    """Print raw rels and settings XML so the template reference can be located manually."""
    if not zipfile.is_zipfile(docx_path):
        print(f" Not a valid ZIP/docx: {docx_path}")
        return
    # The three places a template attachment can live.
    interesting = [
        "word/_rels/document.xml.rels",
        "word/_rels/settings.xml.rels",
        "word/settings.xml",
    ]
    with zipfile.ZipFile(docx_path, "r") as zin:
        members = zin.namelist()
        print(f"\n=== Files in archive ===")
        for member in sorted(members):
            print(f" {member}")
        for path in interesting:
            print(f"\n=== {path} ===")
            if path not in members:
                print(" (not found)")
                continue
            # Cap the dump at 60 lines to keep output manageable.
            for line in zin.read(path).decode("utf-8").splitlines()[:60]:
                print(line)
def detach_template(docx_path, dry_run=False, backup=True):
    """
    Remove the attached template reference from a .docx file.

    A .docx is a ZIP archive. The template attachment is stored in
    word/_rels/settings.xml.rels as a Relationship element with
    Type ending in 'attachedTemplate', and referenced in
    word/settings.xml as a w:attachedTemplate element.
    Older documents may instead store it in word/_rels/document.xml.rels.
    All three locations are checked and cleaned independently.

    Args:
        docx_path: Path to the .docx file to clean.
        dry_run: If True, report what would change without modifying anything.
        backup: If True, copy the original to <docx_path>.bak before rewriting.

    Returns:
        str: Status message describing what happened.
    """
    if not zipfile.is_zipfile(docx_path):
        return f" SKIPPED (not a valid ZIP/docx): {docx_path}"
    doc_rels_path = "word/_rels/document.xml.rels"
    settings_rels_path = "word/_rels/settings.xml.rels"
    settings_path = "word/settings.xml"
    rel_pattern = r'<Relationship[^>]*attachedTemplate[^>]*/>'
    settings_pattern = r'<w:attachedTemplate\b[^/]*/>'
    try:
        # Read the whole archive up front so the source ZIP is closed before
        # we replace it on disk — shutil.move over a still-open file fails on
        # Windows. (.docx files are small, so holding them in memory is fine.)
        with zipfile.ZipFile(docx_path, "r") as zin:
            names = zin.namelist()
            contents = {name: zin.read(name) for name in names}

        def text_of(name):
            """Decoded XML for a member, or None if it is absent."""
            return contents[name].decode("utf-8") if name in contents else None

        doc_rels_content = text_of(doc_rels_path)
        settings_rels_content = text_of(settings_rels_path)
        settings_content = text_of(settings_path)
        doc_rels_matches = re.findall(rel_pattern, doc_rels_content, re.DOTALL) if doc_rels_content else []
        settings_rels_matches = re.findall(rel_pattern, settings_rels_content, re.DOTALL) if settings_rels_content else []
        settings_changed = bool(settings_content and re.search(settings_pattern, settings_content))
        if not doc_rels_matches and not settings_rels_matches and not settings_changed:
            return f" OK (no template attached): {docx_path}"
        # Extract template name for reporting
        first_match = (settings_rels_matches or doc_rels_matches or [None])[0]
        if first_match:
            target = re.search(r'Target="([^"]*)"', first_match)
            tname = target.group(1) if target else "unknown"
        else:
            tname = "unknown (settings.xml only)"
        if dry_run:
            return f" WOULD DETACH template '{tname}': {docx_path}"
        if backup:
            shutil.copy2(docx_path, docx_path + ".bak")

        def strip_matches(xml, matches):
            """Remove each matched element, then collapse blank lines left behind."""
            for m in matches:
                xml = xml.replace(m, "")
            return re.sub(r"\n\s*\n", "\n", xml)

        # Build cleaned versions of any changed members in place.
        if doc_rels_matches and doc_rels_content:
            contents[doc_rels_path] = strip_matches(doc_rels_content, doc_rels_matches).encode("utf-8")
        if settings_rels_matches and settings_rels_content:
            contents[settings_rels_path] = strip_matches(settings_rels_content, settings_rels_matches).encode("utf-8")
        if settings_changed and settings_content:
            cleaned = re.sub(settings_pattern, "", settings_content)
            contents[settings_path] = re.sub(r"\n\s*\n", "\n", cleaned).encode("utf-8")

        # Rewrite the ZIP via a temp file, then replace the original.
        tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
        os.close(tmp_fd)
        try:
            with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED) as zout:
                for item in names:
                    zout.writestr(item, contents[item])
            shutil.move(tmp_path, docx_path)
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise
        return f" DETACHED template '{tname}': {docx_path}"
    except zipfile.BadZipFile:
        return f" ERROR (corrupt zip): {docx_path}"
    except Exception as e:
        return f" ERROR ({e}): {docx_path}"
def find_docx_files(path, recursive=False):
    """Find all .docx files at the given path."""
    def wanted(filename):
        # Real documents only: skip Word's "~$" owner/lock files.
        return filename.lower().endswith(".docx") and not filename.startswith("~$")

    # A single file: return it if it qualifies, otherwise nothing.
    if os.path.isfile(path):
        return [path] if wanted(os.path.basename(path)) else []

    found = []
    if recursive:
        for root, dirs, filenames in os.walk(path):
            # Skip hidden directories
            dirs[:] = [d for d in dirs if not d.startswith(".")]
            found.extend(os.path.join(root, f) for f in filenames if wanted(f))
    else:
        found.extend(os.path.join(path, f) for f in os.listdir(path) if wanted(f))
    return sorted(found)
def main():
    """CLI entry point: locate .docx files, then diagnose or detach templates."""
    parser = argparse.ArgumentParser(
        description="Detach templates from Word .docx files to remove inherited macros."
    )
    parser.add_argument("path", help="Path to a .docx file or a folder containing .docx files")
    parser.add_argument("--recursive", "-r", action="store_true",
                        help="Process subfolders recursively")
    parser.add_argument("--dry-run", "-n", action="store_true",
                        help="Show what would be changed without modifying anything")
    parser.add_argument("--no-backup", action="store_true",
                        help="Skip creating .bak backup files (not recommended)")
    parser.add_argument("--diagnose", action="store_true",
                        help="Print raw rels and settings XML to help locate the template reference")
    args = parser.parse_args()

    if not os.path.exists(args.path):
        print(f"Error: path not found: {args.path}")
        sys.exit(1)
    files = find_docx_files(args.path, recursive=args.recursive)
    if not files:
        print("No .docx files found.")
        sys.exit(0)
    print(f"Found {len(files)} .docx file(s)")

    # Diagnose mode just dumps XML and exits — nothing is modified.
    if args.diagnose:
        print("DIAGNOSE mode — printing raw XML, no files will be modified\n")
        for f in files:
            print(f"\n{'=' * 60}")
            print(f"File: {f}")
            diagnose(f)
        sys.exit(0)

    if args.dry_run:
        print("DRY RUN — no files will be modified\n")
    elif not args.no_backup:
        print("Backups will be created (.bak)\n")

    # Tally outcomes by inspecting each status message.
    tally = {"detached": 0, "skipped": 0, "errors": 0}
    for f in files:
        result = detach_template(f, dry_run=args.dry_run, backup=not args.no_backup)
        print(result)
        if "DETACHED" in result or "WOULD DETACH" in result:
            tally["detached"] += 1
        elif "ERROR" in result:
            tally["errors"] += 1
        else:
            tally["skipped"] += 1
    print(f"\nSummary: {tally['detached']} detached, {tally['skipped']} skipped, {tally['errors']} errors")


if __name__ == "__main__":
    main()

Hosted with ❤️ on GitHub

This script removes the attached-template reference, resetting the document to the Normal template and making sure the macro is no longer attached — stopping the annoying ‘are you sure about macros’ dialog box and making it safe for archiving.

I’m not doing a huge number of interviews for this PhD, but even so, this has massively changed how quickly I can get to a document ready for editing. If you want to read more about my PhD you can visit phd.anthonyarblaster.com.