diff options
author | Nick White <git@njw.me.uk> | 2013-03-31 18:32:08 +0100 |
---|---|---|
committer | Nick White <git@njw.me.uk> | 2013-03-31 18:32:08 +0100 |
commit | 2742524159ad2bc861711df084f6bd77588e9e9b (patch) | |
tree | 00508f5d2b5232a8d10298acb59facf86f6f5ff9 | |
download | oed2dict-2742524159ad2bc861711df084f6bd77588e9e9b.tar.bz2 oed2dict-2742524159ad2bc861711df084f6bd77588e9e9b.zip |
Initial commit
-rw-r--r-- | BUGS | 5 | ||||
-rw-r--r-- | Makefile | 40 | ||||
-rw-r--r-- | htmtojargon.awk | 17 | ||||
-rw-r--r-- | symbols | 64 | ||||
-rw-r--r-- | symbolstosed.awk | 6 | ||||
-rw-r--r-- | xmlcleanup.sed | 39 | ||||
-rw-r--r-- | xmlcleanup2.sed | 13 |
7 files changed, 184 insertions, 0 deletions
diff --git a/BUGS b/BUGS
new file mode 100644
--- /dev/null
+++ b/BUGS
@@ -0,0 +1,5 @@
+Noun, adj / verb (and similar prob) markings after headword are searched against in the index. so searching for "A" or "n" are impossible as they return all adjectives / nouns.
+
+Search results are not returned in order. May be a limitation of the dict protocol.
+
+See TODO notes in scripts
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..56e318d
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,40 @@
+# requires dictfmt and dictzip tools (from the package dictfmt)
+
+PREFIX = /usr
+
+SRC = 1.htm 2.htm 3.htm 4.htm 5.htm \
+	6.htm 7.htm 8.htm 9.htm 10.htm
+JARGONS = $(SRC:.htm=.jargon)
+# 'all' must be the first target, otherwise a bare "make" only builds 1.jargon
+all: oed.dict.dz
+
+$(JARGONS): symbols.sed
+
+.htm.jargon:
+	awk -f htmtojargon.awk < "$<" \
+	  | sed -f symbols.sed \
+	  | sed -f xmlcleanup.sed \
+	  | sed -f xmlcleanup2.sed \
+	  > "$@"
+
+symbols.sed: symbols symbolstosed.awk
+	awk -f symbolstosed.awk < symbols > $@
+
+oed.dict: $(JARGONS)
+	cat $(JARGONS) \
+	  | dictfmt -j --utf8 \
+	  --columns 0 --headword-separator ' ' \
+	  -u http://njw.me.uk/oed \
+	  -s "Oxford English Dictionary Second Edition v3" \
+	  oed
+
+oed.dict.dz: oed.dict
+	dictzip -k $<
+# the dictd hooks are optional: skip them (without failing) where they are absent
+install: all
+	cp oed.dict.dz oed.index $(DESTDIR)$(PREFIX)/share/dictd/
+	if test -x /usr/sbin/dictdconfig; then dictdconfig -w; fi
+	if test -x /etc/init.d/dictd; then /etc/init.d/dictd restart; fi
+
+.SUFFIXES: .htm .jargon .sed .dict .dz
+.PHONY: all install
diff --git a/htmtojargon.awk b/htmtojargon.awk
new file mode 100644
index 0000000..16c5356
--- /dev/null
+++ b/htmtojargon.awk
@@ -0,0 +1,17 @@
+#!/usr/bin/awk -f
+# dirty xml reading is more fun
+# each record is split at </hg>: $1 becomes the word, $2 the definitions
+BEGIN {
+	FS = "</hg>"
+}
+
+{ printdefs($1, $2); }
+# print one ":word: text" line per non-empty "<def>"-separated chunk of defs
+function printdefs(word, defs,    n, i, array) {
+	n = split(defs, array, "<def>");
+	for(i=1; i<=n; i++) {
+		if(array[i] != "") {
+			printf(":%s: %s\n", word, array[i]);
+		}
+	}
+}
diff --git a/symbols b/symbols
new file mode 100644
--- /dev/null
+++ b/symbols
@@ -0,0 +1,64 @@
+# All symbols can be represented either as &symbol; or _symbol_ so
+# parse these with symbolstosed to create sed rules.
+
+# A large list of these in Help/Advanced/symbols.htm
+
+# punctuation
+revsc ·
+
+# Ligature digraphs
+Ae Æ
+ae æ
+Oe Œ
+oe œ
+
+# Phonetic symbols
+aepr ɶ
+ccedpr θ
+edhpr ð
+fata ɑ
+lm ː
+ope ɛ
+revc ɔ
+rfa ɒ
+schwa ə
+sh ʃ
+shti ɨ
+shtu ɯ
+sm ˈ
+smpr ˈ
+smm ˌ
+smmpr ˌ
+zh ʝ
+
+Abarab Ā
+
+# These are extras which aren't documented in the symbol list
+
+th th
+Th Th
+
+Aang Å
+ouml ö
+
+oq ‘
+cq ’
+
+times ×
+
+# Basic HTML entities
+# TODO: generate these from http://www.w3.org/TR/html4/sgml/entities.html or http://www.w3.org/TR/xml-entity-names/
+\#038 \&
+aacu á
+agrave à
+auml ä
+eacu é
+egrave è
+oacu ó
+ocirc ô
+
+# TODO
+# edh - unknown
+# ygh - some variant of y
+# thbar - probably th with bar above
+# asg
diff --git a/symbolstosed.awk b/symbolstosed.awk
new file mode 100644
index 0000000..6eb41f2
--- /dev/null
+++ b/symbolstosed.awk
@@ -0,0 +1,6 @@
+#!/usr/bin/awk -f
+
+/^[^#]/{
+	printf("s/_%s_/%s/g\n", $1, $2);
+	printf("s/&%s;/%s/g\n", $1, $2);
+}
diff --git a/xmlcleanup.sed b/xmlcleanup.sed
new file mode 100644
index 0000000..f302b11
--- /dev/null
+++ b/xmlcleanup.sed
@@ -0,0 +1,39 @@
+#!/bin/sed -f
+
+# unfortunately they use latin-1 rather than ascii...
+# note \x is a gnu extension
+s/\xA3/£/g
+
+# seemingly unneeded control characters
+s/&ff[0-9];//g
+
+# <q> corresponds to a quote
+s/<q>/\n\n/g
+
+# quote text start and end points
+s/<qt>/“/g
+s/<\/qt>/”/g
+
+# new paragraph
+s/∥/\n\n/g
+
+# a date
+s/<d>//g
+s/<\/d>/:/g
+
+# brackets
+s/&obr;/{/g
+s/&cbr;/}/g
+
+# space
+s/ / /g
+
+# superscript
+s/<sup>/ /g
+s/<\/sup>//g
+
+# xNNNN codes (unknown what the NNNN refers to; not unicode)
+s/<x2145>/`/g
+s/<x2146>/'/g
+s/<x2150>/-/g
+s/<xA043>/../g
diff --git a/xmlcleanup2.sed b/xmlcleanup2.sed
new file mode 100644
index 0000000..5352a92
--- /dev/null
+++ b/xmlcleanup2.sed
@@ -0,0 +1,13 @@
+#!/bin/sed -f
+# sed rules that must run after the main set
+
+# <e> tags seem to duplicate <v> tags for some quotes
+# NOTE this assumes e tags only come at end of lines
+s/<e>.*$//g
+# TODO: make this work instead of the above
+#s/<e>.*?<\/e>//g
+
+# any xml tags not processed can just go away
+s/<[^>]*>//g
+# any xml character entities not processed can just go away
+s/&[^;]*;//g