开始之前,先看一下从人人网中发现的90后用户爱用的词
是不是很好玩,哈哈。写这篇文章就是为了让你能够简单、自动地从文本中找出新词,这样就知道现在的年轻人喜欢什么了(对于博主这种上了年纪的人来说,真的是很有用,呜呜)
项目结构
当然,text.dat和common.dic这两个文件你可以随意替换,但要注意text.dat中的数据量一定要足够大,否则统计结果没有什么意义
原理么,看下Matrix67大牛的文章你就懂了
互联网时代的社会语言学:基于SNS的文本数据挖掘
训练数据下载
下边开始上代码
common
这个里边包含以下几个类,主要是定义数据结构
CountMap.java
定义一个计数Map来进行数据操作和持久化
package grid.common;

import java.io.Serializable;
import java.util.HashMap;

/**
 * A {@link HashMap} from keys to occurrence counts, with helpers for
 * incrementing a key's count and summing all counts.
 *
 * @param <T> type of the counted keys
 */
public class CountMap<T> extends HashMap<T, Integer> implements Serializable {

    private static final long serialVersionUID = 6097963798841161750L;

    /**
     * Records one more occurrence of the given key. A key that is not yet
     * present (or is mapped to {@code null}) starts at 1.
     *
     * @param t the key whose count is incremented
     */
    public void increase(T t) {
        Integer previous = get(t);
        if (previous != null) {
            put(t, previous + 1);
            return;
        }
        put(t, 1);
    }

    /**
     * Sums the counts of all keys.
     *
     * @return the total of every stored count
     */
    public int count() {
        int total = 0;
        for (Integer occurrences : values()) {
            total += occurrences;
        }
        return total;
    }

    /**
     * Looks up the count stored for a primitive {@code char} key.
     *
     * @param c the character to look up (boxed to {@link Character} for the lookup)
     * @return the stored count, or 0 when the character has no entry
     */
    public int get(char c) {
        Integer occurrences = super.get(Character.valueOf(c));
        if (occurrences == null) {
            return 0;
        }
        return occurrences;
    }
}
Node.java
定义语法树的节点
package grid.common;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
 * A node of a generic tree. Each node holds a value, a reference to its
 * parent and a lazily created list of children.
 *
 * @param <T> type of the value stored in each node
 */
public class Node<T> {

    /** Child nodes in insertion order; remains {@code null} until the first {@link #add(Object)}. */
    protected List<Node<T>> children;

    /** The node this one was added to, or {@code null} if it has none. */
    protected Node<T> parent;

    /** The value carried by this node. */
    protected T value;

    /**
     * Creates a parentless, childless node.
     *
     * @param value the value to store
     */
    Node(T value) {
        this.value = value;
    }

    /**
     * Appends a new child holding the given value.
     *
     * @param value the value for the new child
     * @return the newly created child node
     */
    public Node<T> add(T value) {
        if (null == children) {
            children = new ArrayList<Node<T>>();
        }
        Node<T> child = new Node<T>(value);
        child.setParent(this);
        children.add(child);
        return child;
    }

    /** @return the value stored in this node */
    public T getValue() {
        return value;
    }

    /** @return the parent node, or {@code null} if none was set */
    public Node<T> getParent() {
        return parent;
    }

    /** @param parent the node to record as this node's parent */
    public void setParent(Node<T> parent) {
        this.parent = parent;
    }

    /**
     * Depth-first walk that collects every node without a child list.
     *
     * @param list   accumulator receiving the leaves
     * @param parent root of the subtree being visited
     */
    private void recurseChildren(List<Node<T>> list, Node<T> parent) {
        if (null == parent.children) {
            list.add(parent);
        } else {
            for (Node<T> node : parent.children) {
                recurseChildren(list, node);
            }
        }
    }

    /**
     * Collects the leaves of the subtree rooted at this node. A node that has
     * never had a child added is its own single leaf.
     *
     * @return the leaves in depth-first order
     */
    public List<Node<T>> getLeaves() {
        List<Node<T>> list = new ArrayList<Node<T>>();
        recurseChildren(list, this);
        return list;
    }

    /**
     * Returns the values on the path leading to this node, outermost first.
     * The walk goes up through the parents and stops before an ancestor that
     * is a {@link Tree} (its value is not included) or when there is no
     * further parent. This node's own value is always included.
     *
     * @return the values from the outermost ancestor down to this node
     */
    public List<T> getBranchPath() {
        List<T> list = new ArrayList<T>();
        Node<T> node = this;
        do {
            list.add(node.getValue());
            node = node.parent;
        } while (null != node && !(node instanceof Tree<?>));
        // Values were gathered bottom-up; flip them to top-down order.
        Collections.reverse(list);
        return list;
    }

    /**
     * Writes one line for {@code node} (indented one space per depth level)
     * followed by the lines of all its descendants.
     */
    private void append(StringBuilder builder, int deep, Node<T> node) {
        for (int i = 0; i < deep; i++) {
            builder.append(" ");
        }
        builder.append("|--");
        builder.append(node.getValue());
        builder.append("\n");
        if (null != node.children) {
            for (Node<T> child : node.children) {
                append(builder, deep + 1, child);
            }
        }
    }

    /**
     * Renders the subtree rooted at this node as indented text, one node per line.
     *
     * @return the multi-line textual dump
     */
    public String dump() {
        StringBuilder builder = new StringBuilder();
        append(builder, 0, this);
        return builder.toString();
    }

    /**
     * Returns the string form of the stored value. A {@code null} value
     * yields {@code "null"} instead of throwing, matching what {@link #dump()}
     * prints for such a node.
     */
    @Override
    public String toString() {
        return String.valueOf(value);
    }
}
TextDatReader.java
读取训练数据
package grid.common;

import java.io.File;
import java.io.FileReader;
import java.io.IOException;

/**
 * Loads a whole text file into a single {@link String}.
 */
public class TextDatReader {

    /**
     * Reads the complete content of the file at {@code path}, decoding it
     * with the platform default charset (that is what {@link FileReader} uses).
     *
     * <p>The returned string always has as many characters as the file has
     * bytes. When decoding produces fewer characters than bytes (multi-byte
     * encodings), the remainder is left as NUL ({@code '\0'}) characters.
     * This padding is kept deliberately: {@code CnDictionary.initWordDic}
     * treats a NUL character as the end-of-data marker.
     *
     * @param path path of the file to read
     * @return the file content, possibly followed by NUL padding
     * @throws IOException if the file cannot be opened or read, or is too
     *         large to fit in a single array
     */
    public static String read(String path) throws IOException {
        File file = new File(path);
        long size = file.length();
        if (size > Integer.MAX_VALUE) {
            // Casting would wrap to a negative or truncated array size.
            throw new IOException("File too large to load into memory: " + path);
        }
        FileReader reader = new FileReader(file);
        try {
            char[] buffer = new char[(int) size];
            int filled = 0;
            // A single read() may return fewer characters than requested,
            // so keep reading until the buffer is full or the stream ends.
            while (filled < buffer.length) {
                int n = reader.read(buffer, filled, buffer.length - filled);
                if (n < 0) {
                    break;
                }
                filled += n;
            }
            return new String(buffer);
        } finally {
            // Always release the file handle, even when reading fails.
            reader.close();
        }
    }
}
TextUtils.java
#p#分页标题#e#
用来做文本处理,如判断是否为空、匹配字符等
package grid.common; public class TextUtils { public static boolean isCnLetter(char c) { return c >= 0x4E00 && c <= 0x9FCB; } public static boolean isNumeric(char c) { return c >= '0' && c <= '9'; } public static boolean isEnLetter(char c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } public static boolean match(String src,int off,String dest) { int len = dest.length(); int srcLen = src.length(); for (int i = 0; i < len; i++) { if (srcLen <= off + i) { return false; } if (dest.charAt(i) != src.charAt(off + i)) { return false; } } return true; } public static boolean isBlank(String str) { return null == str || str.isEmpty() || str.trim().isEmpty(); } }
Tree.java
语法树
package grid.common;

/**
 * A {@link Node} that marks the top of a tree. It adds no behaviour of its
 * own; {@link Node#getBranchPath()} stops climbing when it reaches an
 * ancestor of this type.
 *
 * @param <T> type of the value stored in each node
 */
public class Tree<T> extends Node<T> {

    /**
     * Creates a tree whose top node holds the given value.
     *
     * @param value the value stored in the top node
     */
    public Tree(T value) {
        super(value);
    }
}
dic
里边包含CnDictionary类
CnDictionary.java
词典处理
package grid.text.dic;

import grid.common.CountMap;
import grid.common.TextDatReader;
import grid.common.TextUtils;

import java.io.IOException;
import java.util.HashSet;
import java.util.Set;

/**
 * Dictionary of known words plus per-character frequency statistics, both
 * loaded from files when the single shared instance is first requested.
 */
public class CnDictionary {

    /** File holding the known words, one per line. */
    private static final String COMMON_WORD_DIC_PATH = "common.dic";

    /**
     * This text data is for character statistic. Change to your own if you
     * like.
     */
    private static final String COMMON_LETTER_RESOURCE_PATH = "text.dat";

    private Set<String> dictionary = new HashSet<String>();

    private CountMap<Character> letterCountMap = new CountMap<Character>();

    /** Sum of all counts in {@link #letterCountMap}. */
    private int totalLetterCount;

    /** Lazily created shared instance. */
    private static CnDictionary instance;

    /**
     * Returns the shared dictionary, creating it on first use.
     *
     * <p>If loading fails, the stack trace is printed and {@code null} is
     * returned; the next call tries to load again. This method performs no
     * synchronization.
     *
     * @return the shared instance, or {@code null} if the files could not be read
     */
    public static CnDictionary Instance() {
        if (null == instance) {
            try {
                instance = new CnDictionary();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return instance;
    }

    private CnDictionary() throws IOException {
        initWordDic();
        initLetterCountMap();
    }

    /**
     * Reads the corpus file and counts every character that
     * {@link TextUtils#isCnLetter(char)} accepts.
     */
    private void initLetterCountMap() throws IOException {
        String letterResource = TextDatReader.read(COMMON_LETTER_RESOURCE_PATH);
        final int len = letterResource.length();
        for (int i = 0; i < len; i++) {
            char c = letterResource.charAt(i);
            if (TextUtils.isCnLetter(c)) {
                letterCountMap.increase(c);
            }
        }
        totalLetterCount = letterCountMap.count();
    }

    /**
     * Reads the word file and stores each non-blank line, trimmed, as a word.
     * A NUL character ends the data. The last word is stored even when the
     * file does not end with a line break or NUL.
     */
    private void initWordDic() throws IOException {
        String content = TextDatReader.read(COMMON_WORD_DIC_PATH);
        final int len = content.length();
        StringBuilder word = new StringBuilder();
        for (int i = 0; i < len; i++) {
            char c = content.charAt(i);
            if (0 == c) {
                // NUL marks the end of the usable data.
                break;
            }
            if ('\n' == c || '\r' == c) {
                addWord(word.toString());
                word.setLength(0);
            } else {
                word.append(c);
            }
        }
        // Flush whatever is pending: the word before a NUL, or a final line
        // that has no terminator at all.
        addWord(word.toString());
    }

    /** Stores the trimmed candidate unless it is blank. */
    private void addWord(String candidate) {
        if (!TextUtils.isBlank(candidate)) {
            dictionary.add(candidate.trim());
        }
    }

    /**
     * @param word the word to look up
     * @return {@code true} if the word was loaded from the word file
     */
    public boolean contains(String word) {
        return dictionary.contains(word);
    }

    /**
     * Returns the relative frequency of a character in the corpus: its count
     * divided by the total number of counted characters.
     *
     * @param c the character to look up
     * @return the frequency; the result is {@code NaN} when no characters
     *         were counted at all (0 divided by 0)
     */
    public double rate(char c) {
        return (double) letterCountMap.get(c) / totalLetterCount;
    }

    /** @return the number of distinct words loaded */
    public int size() {
        return dictionary.size();
    }
}