diff options
author | Nick White <git@njw.me.uk> | 2013-05-26 13:06:06 +0100 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2013-05-26 13:06:06 +0100 |
commit | b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (patch) | |
tree | ffaea14138b844e5645ad952a1d54622c15ff7b9 | |
parent | b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c (diff) | |
download | oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.tar.bz2 oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.zip |
Convert to proper UTF-8, separate word type stuff from definitions
-rw-r--r-- | Makefile | 10 | ||||
-rw-r--r-- | separateheadmeta.sed | 2 | ||||
-rw-r--r-- | xmlcleanup.sed | 4 |
3 files changed, 9 insertions, 7 deletions
@@ -4,12 +4,15 @@ SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
 	6.htm 7.htm 8.htm 9.htm 10.htm
 JARGONS = $(SRC:.htm=.jargon)
 
-$(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed
+$(JARGONS): htmtojargon.awk separateheadmeta.sed symbols.sed \
+	xmlcleanup.sed xmlcleanup2.sed
 
 all: oed.jargon oed.dict.dz
 
 .htm.jargon:
-	awk -f htmtojargon.awk < "$<" \
+	iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+	| awk -f htmtojargon.awk \
+	| sed -f separateheadmeta.sed \
 	| sed -f symbols.sed \
 	| sed -f xmlcleanup.sed \
 	| sed -f xmlcleanup2.sed \
@@ -24,7 +27,8 @@ oed.jargon: $(JARGONS)
 oed.dict: oed.jargon
 	cat $< \
 	| dictfmt -j --utf8 \
-	--columns 0 --headword-separator ' ' \
+	--columns 0 --headword-separator ',' \
+	--index-data-separator ';' \
 	-u http://njw.me.uk/oed \
 	-s "Oxford English Dictionary, 2nd Edition" \
 	oed
diff --git a/separateheadmeta.sed b/separateheadmeta.sed
new file mode 100644
index 0000000..9dca6c6
--- /dev/null
+++ b/separateheadmeta.sed
@@ -0,0 +1,2 @@
+# this replaces the first <i> in a headword with "; "
+/^:[^:]*:/ s/<i>/; /
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
index b67d204..c757859 100644
--- a/xmlcleanup.sed
+++ b/xmlcleanup.sed
@@ -1,9 +1,5 @@
 #!/bin/sed -f
 
-# unfortunately they use latin-1 rather than ascii...
-# note \x is a gnu extension
-s/\xA3/£/g
-
 # seemingly unneeded control characters
 s/&ff[0-9];//g
 s/xxff0//g