summaryrefslogtreecommitdiff
path: root/xmlcleanup.sed
diff options
context:
space:
mode:
Diffstat (limited to 'xmlcleanup.sed')
-rw-r--r--xmlcleanup.sed39
1 files changed, 39 insertions, 0 deletions
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
new file mode 100644
index 0000000..f302b11
--- /dev/null
+++ b/xmlcleanup.sed
@@ -0,0 +1,39 @@
+#!/bin/sed -f
+
+# the input is latin-1, not ascii: rewrite byte 0xA3 (pound sign) as a literal £
+# (the \x escape used here is a GNU sed extension)
+s|\xA3|£|g
+
+# drop the &ff0; .. &ff9; entities, which look like unneeded control codes
+s|&ff[0-9];||g
+
+# <q> marks a quote: replace the marker with two newlines
+s|<q>|\n\n|g
+
+# <qt> ... </qt> delimit the quoted text: swap the tags for curly double quotes
+s|<qt>|“|g
+s|</qt>|”|g
+
+# &par; is a paragraph break: replace it with two newlines
+s|&par;|\n\n|g
+
+# <d> ... </d> wraps a date: drop the opening tag, turn the closing tag into ':'
+s|<d>||g
+s|</d>|:|g
+
+# &obr; / &cbr; are the open / close bracket entities, written out as braces
+s|&obr;|{|g
+s|&cbr;|}|g
+
+# the non-breaking space entity becomes a plain space
+s|&nbsp;| |g
+
+# superscript: the opening tag becomes a space, the closing tag is dropped
+s|<sup>| |g
+s|</sup>||g
+
+# <xNNNN> codes (what NNNN refers to is unknown; they are not unicode code points)
+s|<x2145>|`|g
+s|<x2146>|'|g
+s|<x2150>|-|g
+s|<xA043>|..|g