summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNick White <git@njw.me.uk>2013-03-31 18:32:08 +0100
committerNick White <git@njw.me.uk>2013-03-31 18:32:08 +0100
commit2742524159ad2bc861711df084f6bd77588e9e9b (patch)
tree00508f5d2b5232a8d10298acb59facf86f6f5ff9
downloadoed2dict-2742524159ad2bc861711df084f6bd77588e9e9b.tar.bz2
oed2dict-2742524159ad2bc861711df084f6bd77588e9e9b.zip
Initial commit
-rw-r--r--BUGS5
-rw-r--r--Makefile40
-rw-r--r--htmtojargon.awk17
-rw-r--r--symbols64
-rw-r--r--symbolstosed.awk6
-rw-r--r--xmlcleanup.sed39
-rw-r--r--xmlcleanup2.sed13
7 files changed, 184 insertions, 0 deletions
diff --git a/BUGS b/BUGS
new file mode 100644
index 0000000..01a1916
--- /dev/null
+++ b/BUGS
@@ -0,0 +1,5 @@
+Part-of-speech markings after the headword (noun, adj, verb, and probably similar ones) are included in the index, so searching for "A" or "n" is impossible: such searches return all adjectives / nouns.
+
+Search results are not returned in order. May be a limitation of the dict protocol.
+
+See TODO notes in scripts
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..56e318d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,40 @@
+# requires dictfmt and dictzip tools (from the package dictfmt)
+
+PREFIX = /usr
+
+SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
+	6.htm 7.htm 8.htm 9.htm 10.htm
+JARGONS = $(SRC:.htm=.jargon)
+# all must be the first target so that a plain "make" builds the dictionary
+all: oed.dict.dz
+# every script used by the .htm.jargon recipe is a prerequisite of each .jargon
+$(JARGONS): symbols.sed htmtojargon.awk xmlcleanup.sed xmlcleanup2.sed
+
+# convert one .htm file to a .jargon file; written via a temp file so a failed run never leaves a truncated target
+.htm.jargon:
+	awk -f htmtojargon.awk < "$<" \
+	| sed -f symbols.sed \
+	| sed -f xmlcleanup.sed \
+	| sed -f xmlcleanup2.sed > "$@.tmp"
+	mv -f "$@.tmp" "$@"
+# generate the sed substitution script from the symbols table; replaced only when awk succeeds
+symbols.sed: symbols symbolstosed.awk
+	awk -f symbolstosed.awk < symbols > "$@.tmp" && mv -f "$@.tmp" "$@"
+# feed every .jargon file to dictfmt, using "oed" as the database base name
+oed.dict: $(JARGONS)
+	cat $(JARGONS) | dictfmt -j --utf8 \
+		--columns 0 \
+		--headword-separator ' ' \
+		-u http://njw.me.uk/oed \
+		-s "Oxford English Dictionary Second Edition v3" \
+		oed
+# compress the database with dictzip; -k is passed so oed.dict is kept
+oed.dict.dz: oed.dict
+	dictzip -k oed.dict
+# install into dictd's data directory; the dictd hooks run only where they exist
+install: all
+	mkdir -p $(DESTDIR)$(PREFIX)/share/dictd && cp oed.dict.dz oed.index $(DESTDIR)$(PREFIX)/share/dictd/
+	if test -x /usr/sbin/dictdconfig; then /usr/sbin/dictdconfig -w; fi
+	if test -x /etc/init.d/dictd; then /etc/init.d/dictd restart; fi
+.PHONY: all install # command targets, not files
+.SUFFIXES: .htm .jargon .sed .dict .dz
diff --git a/htmtojargon.awk b/htmtojargon.awk
new file mode 100644
index 0000000..16c5356
--- /dev/null
+++ b/htmtojargon.awk
@@ -0,0 +1,17 @@
+#!/usr/bin/awk -f
+# dirty xml reading is more fun
+#
+# Each input line is split at "</hg>": the text before it is passed to
+# printdefs as the word, the text after it as that word's definitions.
+BEGIN { FS = "</hg>" }
+
+{ printdefs($1, $2) }
+
+# print ":word: text" for each non-empty "<def>"-separated chunk of defs; n, i, parts are locals
+function printdefs(word, defs,    n, i, parts) {
+	n = split(defs, parts, "<def>");
+	for(i=1; i<=n; i++) {
+		if(parts[i] != "")
+			printf(":%s: %s\n", word, parts[i]);
+	}
+}
diff --git a/symbols b/symbols
new file mode 100644
index 0000000..797961c
--- /dev/null
+++ b/symbols
@@ -0,0 +1,64 @@
+# All symbols can be represented either as &symbol; or _symbol_ so
+# parse these with symbolstosed to create sed rules.
+
+# A large list of these in Help/Advanced/symbols.htm
+
+# punctuation
+revsc ·
+
+# Ligature digraphs
+Ae Æ
+ae æ
+Oe Œ
+oe œ
+
+# Phonetic symbols
+aepr ɶ
+ccedpr θ
+edhpr ð
+fata ɑ
+lm ː
+ope ɛ
+revc ɔ
+rfa ɒ
+schwa ə
+sh ʃ
+shti ɨ
+shtu ɯ
+sm ˈ
+smpr ˈ
+smm ˌ
+smmpr ˌ
+zh ʝ
+
+Abarab Ā
+
+# These are extras which aren't documented in the symbol list
+
+th th
+Th Th
+
+Aang Å
+ouml ö
+
+oq ‘
+cq ’
+
+times ×
+
+# Basic HTML entities
+# TODO: generate these from http://www.w3.org/TR/html4/sgml/entities.html or http://www.w3.org/TR/xml-entity-names/
+\#038 \&
+aacu á
+agrave à
+auml ä
+eacu é
+egrave è
+oacu ó
+ocirc ô
+
+# TODO
+# edh - unknown
+# ygh - some variant of y
+# thbar - probably th with bar above
+# asg
diff --git a/symbolstosed.awk b/symbolstosed.awk
new file mode 100644
index 0000000..6eb41f2
--- /dev/null
+++ b/symbolstosed.awk
@@ -0,0 +1,6 @@
+#!/usr/bin/awk -f
+# for each table row emit two sed rules mapping _$1_ and &$1; to $2
+/^#/ || /^$/ { next }
+{
+	printf("s/_%s_/%s/g\ns/&%s;/%s/g\n", $1, $2, $1, $2);
+}
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
new file mode 100644
index 0000000..f302b11
--- /dev/null
+++ b/xmlcleanup.sed
@@ -0,0 +1,39 @@
+#!/bin/sed -f
+# first cleanup pass: rewrite known tags and entities as plain text
+# the source uses latin-1; turn its pound-sign byte (0xA3) into a UTF-8 pound sign
+# note \x (and \n in a replacement) are gnu sed extensions
+s/\xA3/£/g
+
+# &ffN; entities: seemingly unneeded control characters, dropped
+s/&ff[0-9];//g
+
+# <q> corresponds to a quote; it is replaced by a paragraph break
+s/<q>/\n\n/g
+
+# quote text start and end points become curly double quotes
+s/<qt>/“/g
+s/<\/qt>/”/g
+
+# new paragraph
+s/&par;/\n\n/g
+
+# a date: the opening tag is dropped, the closing tag becomes a colon
+s/<d>//g
+s/<\/d>/:/g
+
+# brackets: &obr; / &cbr; become curly braces
+s/&obr;/{/g
+s/&cbr;/}/g
+
+# space
+s/&nbsp;/ /g
+
+# superscript: the opening tag becomes a space, the closing tag is dropped
+s/<sup>/ /g
+s/<\/sup>//g
+
+# xNNNN codes (unknown what the NNNN refers to; not unicode)
+s/<x2145>/`/g
+s/<x2146>/'/g
+s/<x2150>/-/g
+s/<xA043>/../g
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
new file mode 100644
index 0000000..5352a92
--- /dev/null
+++ b/xmlcleanup2.sed
@@ -0,0 +1,13 @@
+#!/bin/sed -f
+# sed rules that must run after the main set
+
+# <e> tags seem to duplicate <v> tags for some quotes
+# NOTE this assumes e tags only come at end of lines
+s/<e>.*$//g
+# TODO: make this work instead of the above (sed has no non-greedy .*?)
+#s/<e>.*?<\/e>//g
+
+# any xml tags not processed can just go away
+s/<[^>]*>//g
+# unprocessed entities go away; a literal "&" followed later by ";" must survive
+s/&[#A-Za-z0-9]*;//g