From b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c Mon Sep 17 00:00:00 2001 From: Nick White Date: Sun, 12 May 2013 18:29:42 +0100 Subject: Add README, make things a bit more general --- BUGS | 10 +++++++--- Makefile | 13 +++++++------ README | 34 ++++++++++++++++++++++++++++++++++ xmlcleanup.sed | 1 + 4 files changed, 49 insertions(+), 9 deletions(-) create mode 100644 README diff --git a/BUGS b/BUGS index 01a1916..1426aab 100644 --- a/BUGS +++ b/BUGS @@ -1,5 +1,9 @@ -Noun, adj / verb (and similar prob) markings after headword are searched against in the index. so searching for "A" or "n" are impossible as they return all adjectives / nouns. +There is a bug causing some words to not be included at all. Cowboy and chicken are examples. -Search results are not returned in order. May be a limitation of the dict protocol. +Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns. -See TODO notes in scripts +Many of the IPA and other non-ASCII symbols aren't yet converted to Unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs. + +Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol. 
+ +grep TODO symbols *sed *awk Makefile diff --git a/Makefile b/Makefile index ca485e4..4927483 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,3 @@ -# requires dictfmt and dictzip tools (from the package dictfmt) - PREFIX = /usr SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \ @@ -8,7 +6,7 @@ JARGONS = $(SRC:.htm=.jargon) $(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed -all: oed.dict.dz +all: oed.jargon oed.dict.dz .htm.jargon: awk -f htmtojargon.awk < "$<" \ @@ -20,12 +18,15 @@ all: oed.dict.dz symbols.sed: symbols symbolstosed.awk awk -f symbolstosed.awk < symbols > $@ -oed.dict: $(JARGONS) - cat $(JARGONS) \ +oed.jargon: $(JARGONS) + cat $(JARGONS) > $@ + +oed.dict: oed.jargon + cat $< \ | dictfmt -j --utf8 \ --columns 0 --headword-separator ' ' \ -u http://njw.me.uk/oed \ - -s "Oxford English Dictionary Second Edition v3" \ + -s "Oxford English Dictionary, 2nd Edition" \ oed oed.dict.dz: oed.dict diff --git a/README b/README new file mode 100644 index 0000000..eaeab74 --- /dev/null +++ b/README @@ -0,0 +1,34 @@ +Oxford English Dictionary, 2nd Edition v3 dict edition +======================================================== + +This is a collection of scripts to convert the Oxford English +Dictionary, 2nd Edition (software version v3) into the jargon +and DICT formats. + +Dependencies +------------ + +- To create DICT files the 'dictfmt' and 'dictzip' tools need to + be installed. + +- The files 1-10.htm from the OED CDs need to be copied into this + directory. Some are under the 'Data/' directory on the CDs. + +Install +------- + +- To create a .jargon file, run 'make oed.jargon' + +- To create a .dict file, run 'make oed.dict', or for a compressed + version 'make oed.dict.dz' + +- To install the dictionary for dictd, run 'make install' + +Note +---- + +The dictionary included in v3 is the same as v4, but with v4 the +OED started encrypting the dictionary files. 
They probably wouldn't +be difficult to decrypt, but I haven't the time or inclination to +do so. If anyone wants to, let me know how you get on and I'll +very happily incorporate support for it. diff --git a/xmlcleanup.sed b/xmlcleanup.sed index 5f4a594..b67d204 100644 --- a/xmlcleanup.sed +++ b/xmlcleanup.sed @@ -38,4 +38,5 @@ s/<\/sup>//g s//`/g s//'/g s//-/g +s//—/g s//../g -- cgit v1.2.3