summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Nick White <git@njw.me.uk> 2013-05-12 18:29:42 +0100
committer Nick White <git@njw.me.uk> 2013-05-12 18:29:42 +0100
commit b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c (patch)
tree b1831562fc6d76401adb50807b8eff7c00026fd5
parent 626ecaf28a3c90da2ac1585414e59b2a376d401d (diff)
download oed2dict-b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c.tar.bz2
         oed2dict-b2572f6fa6dbe9e810cbdfcc96f50f04b2fc139c.zip
Add README, make things a bit more general
-rw-r--r-- BUGS 10
-rw-r--r-- Makefile 13
-rw-r--r-- README 34
-rw-r--r-- xmlcleanup.sed 1
4 files changed, 49 insertions, 9 deletions
diff --git a/BUGS b/BUGS
index 01a1916..1426aab 100644
--- a/BUGS
+++ b/BUGS
@@ -1,5 +1,9 @@
-Noun, adj / verb (and similar prob) markings after headword are searched against in the index. so searching for "A" or "n" are impossible as they return all adjectives / nouns.
+There is a bug causing some words to not be included at all. Cowboy and chicken are examples.
-Search results are not returned in order. May be a limitation of the dict protocol.
+Word type (noun, adj, verb, etc.) markings after headword are included in the index, so searching for e.g. "A" or "n" is impossible as it returns all adjectives or nouns.
-See TODO notes in scripts
+Many of the IPA and other non-ascii symbols aren't yet converted to unicode, and are just discarded. Many (but not all) of these symbols are documented in files in Help/ from the original CDs.
+
+Search results are not necessarily returned in order. This is probably unavoidable with the dict protocol.
+
+grep TODO symbols *sed *awk Makefile
diff --git a/Makefile b/Makefile
index ca485e4..4927483 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,3 @@
-# requires dictfmt and dictzip tools (from the package dictfmt)
-
PREFIX = /usr
SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
@@ -8,7 +6,7 @@ JARGONS = $(SRC:.htm=.jargon)
$(JARGONS): htmtojargon.awk symbols.sed xmlcleanup.sed xmlcleanup2.sed
-all: oed.dict.dz
+all: oed.jargon oed.dict.dz
.htm.jargon:
awk -f htmtojargon.awk < "$<" \
@@ -20,12 +18,15 @@ all: oed.dict.dz
symbols.sed: symbols symbolstosed.awk
awk -f symbolstosed.awk < symbols > $@
-oed.dict: $(JARGONS)
- cat $(JARGONS) \
+oed.jargon: $(JARGONS)
+ cat $(JARGONS) > $@
+
+oed.dict: oed.jargon
+ cat $< \
| dictfmt -j --utf8 \
--columns 0 --headword-separator ' ' \
-u http://njw.me.uk/oed \
- -s "Oxford English Dictionary Second Edition v3" \
+ -s "Oxford English Dictionary, 2nd Edition" \
oed
oed.dict.dz: oed.dict
diff --git a/README b/README
new file mode 100644
index 0000000..eaeab74
--- /dev/null
+++ b/README
@@ -0,0 +1,34 @@
+Oxford English Dictionary, 2nd Edition v3 dict edition
+========================================================
+
+This is a collection of scripts to convert the Oxford English
+Dictionary, 2nd Edition (software version v3) into the jargon
+and DICT formats.
+
+Dependencies
+------------
+
+- To create DICT files the 'dictfmt' and 'dictzip' tools need to
+ be installed.
+
+- The files 1-10.htm from the OED CDs need to be copied into this
+ directory. Some are under the 'Data/' directory on the CDs.
+
+Install
+-------
+
+- To create a .jargon file, run 'make oed.jargon'
+
+- To create a .dict file, run 'make oed.dict', or for a compressed
+ version 'make oed.dict.dz'
+
+- To install the dictionary for dictd, run 'make install'
+
+Note
+----
+
+The dictionary included in v3 is the same as v4, but with v4 the
+OED started encrypting the dictionary files. They probably wouldn't
+be difficult to decrypt, but I haven't the time or inclination to
+do so. If anyone wants to, let me know how you get on and I'll
+very happily incorporate support for it.
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
index 5f4a594..b67d204 100644
--- a/xmlcleanup.sed
+++ b/xmlcleanup.sed
@@ -38,4 +38,5 @@ s/<\/sup>//g
s/<x2145>/`/g
s/<x2146>/'/g
s/<x2150>/-/g
+s/<x2151>/—/g
s/<xA043>/../g