海量資料處理
1.一個檔案中,儲存有10億個單詞(數字+字母組成,每個單詞小於16Byte),每行一個,求出現頻率最高的10個單詞。
演算法一:分而治之 + Hash
- 10億個單詞,不能完全載入到記憶體中處理
- 採用“分而治之”的思想,按照單詞的hash值,將單詞儲存在多個檔案中
- 對於每一個小檔案,可以構建一個單詞為key,出現次數為value的Hash map,同時記錄當前出現次數最高的10個單詞
- 可以得到多個小檔案中的出現次數最多的10個單詞,再依據常規的排序演算法得到總體上出現次數最多的10個單詞。這樣做之所以正確,是因為相同的單詞hash值相同,必定被分到同一個小檔案中,故每個單詞的全域出現次數都完整地統計在某一個小檔案裡,不會被切分
//Algorithm 1: implementation of "divide and conquer + hash"
package com.yunfeng;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.*;

public class BigDataSort {

    /** Number of partition files the big input file is hashed into. */
    public final static int FILE_NUM = 10;

    /**
     * Computes the (at most) 10 most frequent words in one partition file.
     *
     * Builds a word -&gt; occurrence-count map for the file, orders the entries
     * by descending count (ties broken by word), and keeps the first 10.
     *
     * @param file a partition file containing one word per line
     * @return the top-10 entries, ordered by descending count; empty on I/O error
     */
    public static Set<WordEntity> getTop10Word(File file) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        Set<WordEntity> setTop10 = new TreeSet<WordEntity>();
        BufferedReader br = null;
        try {
            // NOTE(review): FileReader uses the platform default charset; the
            // words are stated to be digits + letters, so this is safe here.
            br = new BufferedReader(new FileReader(file));
            String s;
            while ((s = br.readLine()) != null) {
                // One lookup instead of the original get/get/put triple lookup.
                Integer old = map.get(s);
                map.put(s, old == null ? 1 : old.intValue() + 1);
            }
            // TreeSet sorts by descending count (see WordEntity.compareTo).
            Set<WordEntity> sorted = new TreeSet<WordEntity>();
            for (Map.Entry<String, Integer> entry : map.entrySet()) {
                sorted.add(new WordEntity(entry.getKey(), entry.getValue()));
            }
            int taken = 0;
            for (WordEntity w : sorted) {
                setTop10.add(w);
                if (++taken == 10) {
                    break;
                }
            }
        } catch (IOException e) {
            // FileNotFoundException is an IOException; one handler covers both.
            e.printStackTrace();
        } finally {
            // BUGFIX: the reader was never closed in the original (leak).
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
        return setTop10;
    }

    /**
     * Splits the big input file into FILE_NUM partition files ("file_0" ..
     * "file_9") by word hash. Hash partitioning guarantees every occurrence
     * of a given word lands in the same partition file, which is what makes
     * the per-file top-10 merge in main() globally correct.
     *
     * @param file path of the big input file, one word per line
     * @return the partition files that were written
     */
    public static File[] cutFileByHash(String file) {
        File[] files = new File[FILE_NUM];
        BufferedWriter[] brs = new BufferedWriter[FILE_NUM];
        BufferedReader br = null;
        try {
            br = new BufferedReader(new FileReader(file));
            for (int i = 0; i < FILE_NUM; i++) {
                files[i] = new File("file_" + i);
                brs[i] = new BufferedWriter(new FileWriter(files[i]));
            }
            String s;
            while ((s = br.readLine()) != null) {
                // hashCode() may be negative; Math.abs keeps the index valid
                // (the remainder can never be Integer.MIN_VALUE, so abs is safe).
                int j = Math.abs(s.hashCode() % FILE_NUM);
                brs[j].write(s);
                brs[j].newLine();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // BUGFIX: close the reader, and null-check each writer. In the
            // original, a missing input file left brs[] entries null and the
            // finally loop threw NullPointerException, masking the real error.
            if (br != null) {
                try {
                    br.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            for (BufferedWriter b : brs) {
                if (b != null) {
                    try {
                        b.close();
                    } catch (IOException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
        return files;
    }

    /**
     * Entry point: partitions "123.txt" by hash, takes the top-10 words of
     * each partition, merges them, and prints the global top 10.
     */
    public static void main(String[] args) {
        File[] files = cutFileByHash("123.txt"); // input file name
        Set<WordEntity> merged = new TreeSet<WordEntity>();
        for (File file : files) {
            merged.addAll(getTop10Word(file));
        }
        int count = 1;
        for (WordEntity wordEntity : merged) {
            System.out.println("The top ten word is :" + wordEntity.getKey()
                    + ";frequence is " + wordEntity.getCount());
            if (count == 10) {
                break;
            }
            count++;
        }
    }
}

/** A word plus its occurrence count, ordered by descending count. */
class WordEntity implements Comparable<WordEntity> {

    private String key;
    private Integer count;

    public WordEntity(String key, Integer count) {
        this.key = key;
        this.count = count;
    }

    /**
     * Descending by count; ties broken ascending by word so that distinct
     * words with equal counts are not collapsed by TreeSet.
     */
    public int compareTo(WordEntity o) {
        // Integer.compare avoids the int-subtraction comparison idiom of the
        // original (which can overflow in the general case).
        int cmp = Integer.compare(count.intValue(), o.count.intValue());
        return (cmp == 0 ? key.compareTo(o.key) : -cmp);
    }

    /** equals/hashCode added to be consistent with compareTo. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof WordEntity)) {
            return false;
        }
        WordEntity other = (WordEntity) obj;
        return key.equals(other.key) && count.equals(other.count);
    }

    @Override
    public int hashCode() {
        return key.hashCode() * 31 + count.hashCode();
    }

    public String getKey() {
        return key;
    }

    public Integer getCount() {
        return count;
    }
}