海量資料處理
1.一個檔案中,儲存有10億個單詞(數字+字母組成,每個單詞小於16Byte),每行一個,求出現頻率最高的10個單詞。
演算法一:分而治之 + Hash
- 10億個單詞,不能完全載入到記憶體中處理
- 採用“分而治之”的思想,按照單詞的hash值,將單詞儲存在多個檔案中
- 對於每一個小檔案,可以構建一個單詞為key,出現次數為value的Hash map,同時記錄當前出現次數最高的10個單詞
- 可以得到多個小檔案中的出現次數最多的10個單詞,再依據常規的排序演算法得到總體上出現次數最多的10個單詞。這樣做之所以正確,是因為相同的單詞hash值相同,必定被分到同一個小檔案中,故每個單詞的全域出現次數都完整地統計在某一個小檔案裡,不會被切分
//Algorithm 1: implementation of "divide and conquer + hash"
package com.yunfeng;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

public class BigDataSort {

    /** Number of partition files the big input file is hashed into. */
    public final static int FILE_NUM = 10;

    /**
     * Computes the (at most) 10 most frequent words in one partition file.
     *
     * Builds a word -&gt; occurrence-count map for the file, orders the entries
     * by descending count (ties broken by word), and keeps the first 10.
     *
     * @param file a partition file containing one word per line
     * @return the top-10 entries, ordered by descending count; empty on I/O error
     */
    public static Set<WordEntity> getTop10Word(File file) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        Set<WordEntity> setTop10 = new TreeSet<WordEntity>();
        BufferedReader br = null;
        try {
            // NOTE(review): FileReader uses the platform default charset; the
            // words are stated to be digits + letters, so this is safe here.
            br = new BufferedReader(new FileReader(file));
            String s;
            while ((s = br.readLine()) != null) {
                // One lookup instead of the original get/get/put triple lookup.
                Integer old = map.get(s);
                map.put(s, old == null ? 1 : old.intValue() + 1);
            }
            // TreeSet sorts by descending count (see WordEntity.compareTo).
            Set<WordEntity> sorted = new TreeSet<WordEntity>();
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                sorted.add(new WordEntity(entry.getKey(), entry.getValue()));
            }
            int taken = 0;
            for (WordEntity w : sorted) {
                setTop10.add(w);
                if (++taken == 10) {
                    break;
                }
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException; one handler covers both.
            e.printStackTrace();
        } finally {
            // BUGFIX: the reader was never closed in the original (leak).
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return setTop10;
    }

    /**
     * Splits the big input file into FILE_NUM partition files ("file_0" ..
     * "file_9") by word hash. Hash partitioning guarantees every occurrence
     * of a given word lands in the same partition file, which is what makes
     * the per-file top-10 merge in main() globally correct.
     *
     * @param file path of the big input file, one word per line
     * @return the partition files that were written
     */
    public static File[] cutFileByHash(String file) {
        File[] files = new File[FILE_NUM];
        BufferedWriter[] brs = new BufferedWriter[FILE_NUM];
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            for (int i = 0; i < FILE_NUM; i++) {
                files[i] = new File("file_" + i);
                brs[i] = new BufferedWriter(new FileWriter(files[i]));
            }
            String s;
            while ((s = br.readLine()) != null) {
                // hashCode() may be negative; Math.abs keeps the index valid
                // (the remainder can never be Integer.MIN_VALUE, so abs is safe).
                int j = Math.abs(s.hashCode() % FILE_NUM);
                brs[j].write(s);
                brs[j].newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: close the reader, and null-check each writer. In the
            // original, a missing input file left brs[] entries null and the
            // finally loop threw NullPointerException, masking the real error.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            for (BufferedWriter b : brs) {
                if (b != null) {
                    try {
                        b.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return files;
    }

    /**
     * Entry point: partitions "123.txt" by hash, takes the top-10 words of
     * each partition, merges them, and prints the global top 10.
     */
    public static void main(String[] args) {
        File[] files = cutFileByHash("123.txt"); // input file name
        Set<WordEntity> merged = new TreeSet<WordEntity>();
        for (File file : files) {
            merged.addAll(getTop10Word(file));
        }
        int count = 1;
        for (WordEntity wordEntity : merged) {
            System.out.println("The top ten word is :" + wordEntity.getKey()
                    + ";frequence is " + wordEntity.getCount());
            if (count == 10) {
                break;
            }
            count++;
        }
    }
}

/** A word plus its occurrence count, ordered by descending count. */
class WordEntity implements Comparable<WordEntity> {

    private String key;
    private Integer count;

    public WordEntity(String key, Integer count) {
        this.key = key;
        this.count = count;
    }

    /**
     * Descending by count; ties broken ascending by word so that distinct
     * words with equal counts are not collapsed by TreeSet.
     */
    public int compareTo(WordEntity o) {
        // Integer.compare avoids the int-subtraction comparison idiom of the
        // original (which can overflow in the general case).
        int cmp = Integer.compare(count.intValue(), o.count.intValue());
        return (cmp == 0 ? key.compareTo(o.key) : -cmp);
    }

    /** equals/hashCode added to be consistent with compareTo. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WordEntity)) {
            return false;
        }
        WordEntity other = (WordEntity) obj;
        return key.equals(other.key) && count.equals(other.count);
    }

    @Override
    public int hashCode() {
        return key.hashCode() * 31 + count.hashCode();
    }

    public String getKey() {
        return key;
    }

    public Integer getCount() {
        return count;
    }
}