diff options
author | Nick White <git@njw.me.uk> | 2013-05-26 13:06:06 +0100 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2013-05-26 13:06:06 +0100 |
commit | b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3 (patch) | |
tree | ffaea14138b844e5645ad952a1d54622c15ff7b9 /Makefile | |
parent | b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c (diff) | |
download | oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.tar.bz2 oed2dict-b0055fe870a04fbd8eaef669c1ccfb0febfe8bc3.zip |
Convert to proper UTF-8, separate word type stuff from definitions
Diffstat (limited to 'Makefile')
-rw-r--r-- | Makefile | 10 |
1 files changed, 7 insertions, 3 deletions
```diff
@@ -4,12 +4,15 @@ SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
 	6.htm 7.htm 8.htm 9.htm 10.htm
 JARGONS = $(SRC:.htm=.jargon)
 
-$(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed
+$(JARGONS): htmtojargon.awk separateheadmeta.sed symbols.sed \
+	xmlcleanup.sed xmlcleanup2.sed
 
 all: oed.jargon oed.dict.dz
 
 .htm.jargon:
-	awk -f htmtojargon.awk < "$<" \
+	iconv -f ISO-8859-1 -t UTF-8 < "$<" \
+	| awk -f htmtojargon.awk \
+	| sed -f separateheadmeta.sed \
 	| sed -f symbols.sed \
 	| sed -f xmlcleanup.sed \
 	| sed -f xmlcleanup2.sed \
@@ -24,7 +27,8 @@ oed.jargon: $(JARGONS)
 oed.dict: oed.jargon
 	cat $< \
 	| dictfmt -j --utf8 \
-	--columns 0 --headword-separator ' ' \
+	--columns 0 --headword-separator ',' \
+	--index-data-separator ';' \
 	-u http://njw.me.uk/oed \
 	-s "Oxford English Dictionary, 2nd Edition" \
 	oed
```