summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Nick White <git@njw.me.uk>, 2013-05-26 13:06:06 +0100
committer: Nick White <git@njw.me.uk>, 2013-05-26 13:06:06 +0100
commit: b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (patch)
tree: ffaea14138b844e5645ad952a1d54622c15ff7b9
parent: b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c (diff)
download: oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.tar.bz2
oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.zip
Convert to proper UTF-8, separate word type stuff from definitions
-rw-r--r-- Makefile | 10
-rw-r--r-- separateheadmeta.sed | 2
-rw-r--r-- xmlcleanup.sed | 4
3 files changed, 9 insertions, 7 deletions
diff --git a/Makefile b/Makefile
index 4927483..10c4fb3 100644
--- a/Makefile
+++ b/Makefile
@@ -4,12 +4,15 @@ SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
6.htm 7.htm 8.htm 9.htm 10.htm
JARGONS = $(SRC:.htm=.jargon)
-$(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed
+$(JARGONS): htmtojargon.awk separateheadmeta.sed symbols.sed \
+ xmlcleanup.sed xmlcleanup2.sed
all: oed.jargon oed.dict.dz
.htm.jargon:
- awk -f htmtojargon.awk < "$<" \
+ iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+ | awk -f htmtojargon.awk \
+ | sed -f separateheadmeta.sed \
| sed -f symbols.sed \
| sed -f xmlcleanup.sed \
| sed -f xmlcleanup2.sed \
@@ -24,7 +27,8 @@ oed.jargon: $(JARGONS)
oed.dict: oed.jargon
cat $< \
| dictfmt -j --utf8 \
- --columns 0 --headword-separator ' ' \
+ --columns 0 --headword-separator ',' \
+ --index-data-separator ';' \
-u http://njw.me.uk/oed \
-s "Oxford English Dictionary, 2nd Edition" \
oed
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
new file mode 100644
index 0000000..9dca6c6
--- /dev/null
+++ b/separateheadmeta.sed
@@ -0,0 +1,2 @@
+# this replaces the first <i> in a headword with "; "
+/^:[^:]*:/ s/<i>/; /
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
index b67d204..c757859 100644
--- a/xmlcleanup.sed
+++ b/xmlcleanup.sed
@@ -1,9 +1,5 @@
#!/bin/sed -f
-# unfortunately they use latin-1 rather than ascii...
-# note \x is a gnu extension
-s/\xA3/£/g
-
# seemingly unneeded control characters
s/&ff[0-9];//g
s/xxff0//g