From b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 Mon Sep 17 00:00:00 2001
From: Nick White
Date: Sun, 26 May 2013 13:06:06 +0100
Subject: Convert to proper UTF-8, separate word type stuff from definitions

---
Note (review): the line breaks and indentation of this patch were
reconstructed from a whitespace-collapsed copy, and the `&nbsp;` token in
separateheadmeta.sed was restored on the assumption that it had been decoded
away (the collapsed copy read `s//; /`, which in sed would reuse the address
regex and replace the whole `:headword:` prefix). Verify both against the
upstream commit before applying.

 Makefile             | 10 +++++++---
 separateheadmeta.sed |  2 ++
 xmlcleanup.sed       |  4 ----
 3 files changed, 9 insertions(+), 7 deletions(-)
 create mode 100644 separateheadmeta.sed

diff --git a/Makefile b/Makefile
index 4927483..10c4fb3 100644
--- a/Makefile
+++ b/Makefile
@@ -4,12 +4,15 @@ SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
 	6.htm 7.htm 8.htm 9.htm 10.htm
 JARGONS = $(SRC:.htm=.jargon)
 
-$(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed
+$(JARGONS): htmtojargon.awk separateheadmeta.sed symbols.sed \
+	xmlcleanup.sed xmlcleanup2.sed
 
 all: oed.jargon oed.dict.dz
 
 .htm.jargon:
-	awk -f htmtojargon.awk < "$<" \
+	iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+	| awk -f htmtojargon.awk \
+	| sed -f separateheadmeta.sed \
 	| sed -f symbols.sed \
 	| sed -f xmlcleanup.sed \
 	| sed -f xmlcleanup2.sed \
@@ -24,7 +27,8 @@ oed.jargon: $(JARGONS)
 oed.dict: oed.jargon
 	cat $< \
 	| dictfmt -j --utf8 \
-	--columns 0 --headword-separator ' ' \
+	--columns 0 --headword-separator ',' \
+	--index-data-separator ';' \
 	-u http://njw.me.uk/oed \
 	-s "Oxford English Dictionary, 2nd Edition" \
 	oed
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
new file mode 100644
index 0000000..9dca6c6
--- /dev/null
+++ b/separateheadmeta.sed
@@ -0,0 +1,2 @@
+# this replaces the first &nbsp; in a headword with "; "
+/^:[^:]*:/ s/&nbsp;/; /
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
index b67d204..c757859 100644
--- a/xmlcleanup.sed
+++ b/xmlcleanup.sed
@@ -1,9 +1,5 @@
 #!/bin/sed -f
 
-# unfortunately they use latin-1 rather than ascii...
-# note \x is a gnu extension
-s/\xA3/£/g
-
 # seemingly unneeded control characters
 s/&ff[0-9];//g
 s/xxff0//g
-- 
cgit v1.2.3