AI 143689: am: CL 143659 am: CL 143472 Reduce dictionary size.
Changed the tree structure to have variable length nodes to save an average of 21% on the dictionary size. Created a shortened English dictionary for Dream - 50K words. Added a shortened Spanish dictionary for Dream - 32K words. Original author: yamasani Merged from: //branches/cupcake/... Original author: android-build Merged from: //branches/donutburger/... Automated import of CL 143689
This commit is contained in:
committed by
The Android Open Source Project
parent
153b961cb6
commit
ebaef5ec19
@@ -45,6 +45,10 @@ public class MakeBinaryDictionary {
|
|||||||
public static final String TAG_WORD = "w";
|
public static final String TAG_WORD = "w";
|
||||||
public static final String ATTR_FREQ = "f";
|
public static final String ATTR_FREQ = "f";
|
||||||
|
|
||||||
|
private static final int FLAG_ADDRESS_MASK = 0x400000;
|
||||||
|
private static final int FLAG_TERMINAL_MASK = 0x800000;
|
||||||
|
private static final int ADDRESS_MASK = 0x3FFFFF;
|
||||||
|
|
||||||
public static final CharNode EMPTY_NODE = new CharNode();
|
public static final CharNode EMPTY_NODE = new CharNode();
|
||||||
|
|
||||||
List<CharNode> roots;
|
List<CharNode> roots;
|
||||||
@@ -179,7 +183,7 @@ public class MakeBinaryDictionary {
|
|||||||
parent.children.add(child);
|
parent.children.add(child);
|
||||||
}
|
}
|
||||||
child.data = data;
|
child.data = data;
|
||||||
child.freq += occur;
|
if (child.freq == 0) child.freq = occur;
|
||||||
if (word.length() > charAt + 1) {
|
if (word.length() > charAt + 1) {
|
||||||
addWordRec(child, word, charAt + 1, occur);
|
addWordRec(child, word, charAt + 1, occur);
|
||||||
} else {
|
} else {
|
||||||
@@ -195,56 +199,76 @@ public class MakeBinaryDictionary {
|
|||||||
static final int ADDR_WIDTH = 23; // Offset to children
|
static final int ADDR_WIDTH = 23; // Offset to children
|
||||||
static final int FREQ_WIDTH_BYTES = 1;
|
static final int FREQ_WIDTH_BYTES = 1;
|
||||||
static final int COUNT_WIDTH_BYTES = 1;
|
static final int COUNT_WIDTH_BYTES = 1;
|
||||||
static final int NODE_SIZE_BYTES =
|
|
||||||
(CHAR_WIDTH + FLAGS_WIDTH + ADDR_WIDTH) / 8 + FREQ_WIDTH_BYTES;
|
|
||||||
|
|
||||||
private void addCount(int count) {
|
private void addCount(int count) {
|
||||||
dict[dictSize++] = (byte) (0xFF & count);
|
dict[dictSize++] = (byte) (0xFF & count);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* TODO: Allow characters to be beyond the 0-255 range. This is required for some latin
|
|
||||||
language not currently supported */
|
|
||||||
private void addNode(CharNode node) {
|
private void addNode(CharNode node) {
|
||||||
int charData = 0xFFFF & node.data;
|
int charData = 0xFFFF & node.data;
|
||||||
if (charData > 254) {
|
if (charData > 254) {
|
||||||
System.out.println("WARNING: Non-ASCII character encountered : " + node.data +
|
dict[dictSize++] = (byte) 255;
|
||||||
", value = " + charData);
|
dict[dictSize++] = (byte) ((node.data >> 8) & 0xFF);
|
||||||
dict[dictSize++] = '@';
|
dict[dictSize++] = (byte) (node.data & 0xFF);
|
||||||
} else {
|
} else {
|
||||||
dict[dictSize++] = (byte) (0xFF & node.data);
|
dict[dictSize++] = (byte) (0xFF & node.data);
|
||||||
}
|
}
|
||||||
dictSize += 3; // Space for children address
|
if (node.children != null) {
|
||||||
if ((0xFFFFFF & node.freq) > 255) {
|
dictSize += 3; // Space for children address
|
||||||
node.freq = (byte) 255;
|
} else {
|
||||||
|
dictSize += 1; // Space for just the terminal/address flags
|
||||||
|
}
|
||||||
|
if ((0xFFFFFF & node.freq) > 255) {
|
||||||
|
node.freq = 255;
|
||||||
|
}
|
||||||
|
if (node.terminal) {
|
||||||
|
byte freq = (byte) (0xFF & node.freq);
|
||||||
|
dict[dictSize++] = freq;
|
||||||
}
|
}
|
||||||
dict[dictSize++] = (byte) (0xFF & node.freq);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int nullChildrenCount = 0;
|
||||||
|
int notTerminalCount = 0;
|
||||||
|
|
||||||
private void updateNodeAddress(int nodeAddress, CharNode node,
|
private void updateNodeAddress(int nodeAddress, CharNode node,
|
||||||
int childrenAddress) {
|
int childrenAddress) {
|
||||||
childrenAddress = 0x7FFFFF & childrenAddress;
|
if ((dict[nodeAddress] & 0xFF) == 0xFF) { // 3 byte character
|
||||||
|
nodeAddress += 2;
|
||||||
|
}
|
||||||
|
childrenAddress = ADDRESS_MASK & childrenAddress;
|
||||||
|
if (childrenAddress == 0) {
|
||||||
|
nullChildrenCount++;
|
||||||
|
} else {
|
||||||
|
childrenAddress |= FLAG_ADDRESS_MASK;
|
||||||
|
}
|
||||||
if (node.terminal) {
|
if (node.terminal) {
|
||||||
childrenAddress |= 0x800000;
|
childrenAddress |= FLAG_TERMINAL_MASK;
|
||||||
|
} else {
|
||||||
|
notTerminalCount++;
|
||||||
}
|
}
|
||||||
dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
|
dict[nodeAddress + 1] = (byte) (childrenAddress >> 16);
|
||||||
dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
|
if ((childrenAddress & FLAG_ADDRESS_MASK) != 0) {
|
||||||
dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
|
dict[nodeAddress + 2] = (byte) ((childrenAddress & 0xFF00) >> 8);
|
||||||
|
dict[nodeAddress + 3] = (byte) ((childrenAddress & 0xFF));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void writeWordsRec(List<CharNode> children) {
|
void writeWordsRec(List<CharNode> children) {
|
||||||
if (children == null || children.size() == 0) {
|
if (children == null || children.size() == 0) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
addCount(children.size());
|
final int childCount = children.size();
|
||||||
int childrenStart = dictSize;
|
addCount(childCount);
|
||||||
for (int j = 0; j < children.size(); j++) {
|
//int childrenStart = dictSize;
|
||||||
|
int[] childrenAddresses = new int[childCount];
|
||||||
|
for (int j = 0; j < childCount; j++) {
|
||||||
CharNode node = children.get(j);
|
CharNode node = children.get(j);
|
||||||
|
childrenAddresses[j] = dictSize;
|
||||||
addNode(node);
|
addNode(node);
|
||||||
}
|
}
|
||||||
for (int j = 0; j < children.size(); j++) {
|
for (int j = 0; j < childCount; j++) {
|
||||||
CharNode node = children.get(j);
|
CharNode node = children.get(j);
|
||||||
// TODO: Fix this when child length becomes variable
|
int nodeAddress = childrenAddresses[j];
|
||||||
int nodeAddress = childrenStart + NODE_SIZE_BYTES * j;
|
|
||||||
int cacheDictSize = dictSize;
|
int cacheDictSize = dictSize;
|
||||||
writeWordsRec(node.children);
|
writeWordsRec(node.children);
|
||||||
updateNodeAddress(nodeAddress, node, node.children != null
|
updateNodeAddress(nodeAddress, node, node.children != null
|
||||||
@@ -253,8 +277,8 @@ public class MakeBinaryDictionary {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void writeToDict(String dictFilename) {
|
void writeToDict(String dictFilename) {
|
||||||
// 2MB max
|
// 4MB max, 22-bit offsets
|
||||||
dict = new byte[2 * 1024 * 1024]; // 2MB upper limit. Actual is probably
|
dict = new byte[4 * 1024 * 1024]; // 4MB upper limit. Actual is probably
|
||||||
// < 1MB in most cases, as there is a limit in the
|
// < 1MB in most cases, as there is a limit in the
|
||||||
// resource size in apks.
|
// resource size in apks.
|
||||||
dictSize = 0;
|
dictSize = 0;
|
||||||
@@ -272,19 +296,29 @@ public class MakeBinaryDictionary {
|
|||||||
void traverseDict(int pos, char[] word, int depth) {
|
void traverseDict(int pos, char[] word, int depth) {
|
||||||
int count = dict[pos++] & 0xFF;
|
int count = dict[pos++] & 0xFF;
|
||||||
for (int i = 0; i < count; i++) {
|
for (int i = 0; i < count; i++) {
|
||||||
char c = (char) (dict[pos] & 0xFF);
|
char c = (char) (dict[pos++] & 0xFF);
|
||||||
word[depth] = c;
|
if (c == 0xFF) {
|
||||||
if ((dict[pos + 1] & 0x80) > 0) {
|
c = (char) (((dict[pos] & 0xFF) << 8) | (dict[pos+1] & 0xFF));
|
||||||
showWord(word, depth + 1, dict[pos + 4] & 0xFF);
|
pos += 2;
|
||||||
|
}
|
||||||
|
word[depth] = c;
|
||||||
|
boolean terminal = (dict[pos] & 0x80) > 0;
|
||||||
|
int address = 0;
|
||||||
|
if ((dict[pos] & (FLAG_ADDRESS_MASK >> 16)) > 0) {
|
||||||
|
address =
|
||||||
|
((dict[pos + 0] & (FLAG_ADDRESS_MASK >> 16)) << 16)
|
||||||
|
| ((dict[pos + 1] & 0xFF) << 8)
|
||||||
|
| ((dict[pos + 2] & 0xFF));
|
||||||
|
pos += 2;
|
||||||
|
}
|
||||||
|
pos++;
|
||||||
|
if (terminal) {
|
||||||
|
showWord(word, depth + 1, dict[pos] & 0xFF);
|
||||||
|
pos++;
|
||||||
}
|
}
|
||||||
int address =
|
|
||||||
((dict[pos + 1] & 0x7F) << 16)
|
|
||||||
| ((dict[pos + 2] & 0xFF) << 8)
|
|
||||||
| ((dict[pos + 3] & 0xFF));
|
|
||||||
if (address != 0) {
|
if (address != 0) {
|
||||||
traverseDict(address, word, depth + 1);
|
traverseDict(address, word, depth + 1);
|
||||||
}
|
}
|
||||||
pos += NODE_SIZE_BYTES;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user