一种在线词典简易爬虫
Cambridge Dictionary
爬虫规则:
- 优先选择 English-Chinese 页面,若没有,则选择 English 页面。
- 去除了多余的下划线、“Add to word list”按钮、播放按钮等元素。
使用方法:
- 打开 Cambridge Dictionary(https://dictionary.cambridge.org/)的任意页面。脚本需要读取同站点 iframe 中的内容,因此必须在该网站的页面上运行。
- 打开浏览器的控制台。方法 1:页面空白位置右键,检查元素;方法 2:按 F12。
- 选择 Console 一栏,粘贴下面的代码并运行。
- 在新出现的最右边的框中输入需要整理的英文单词,一行一个。
- 点击“work”按钮,爬虫开始采集。
- 爬虫采集完毕(右起第二个文本框,即错误信息框中出现“OK!”字样)时,点击“print”按钮,开始打印页面。
/**
 * Replace the current page body with the crawler UI.
 *
 * The UI consists of, in order: the iframe used to load dictionary pages,
 * a read-only "output" textarea, a read-only "error" textarea, the "input"
 * textarea for the word list, and the "work" / "print" buttons whose inline
 * handlers call the global work() and print() functions.
 */
function ready() {
    const widgets = [
        '<iframe id="iframe" src="" height="500px" width="500px"></iframe>',
        '<textarea id="output" style="height:500px;" readonly></textarea>',
        '<textarea id="error" style="height:500px;" readonly></textarea>',
        '<textarea id="input" style="height:500px;" placeholder="input your word list here"></textarea>',
        '<button onclick="work();">work</button>',
        '<button onclick="print();">print</button>',
    ];
    document.body.innerHTML = widgets.join("");
}
// Build the crawler UI as soon as this script runs; this replaces the body of the current page.
ready();
/**
 * Remove every element that carries the given class from a document.
 *
 * @param {Document} iframe - Document to clean (callers pass the iframe's document).
 * @param {string} name - Class name whose elements are detached from their parents.
 */
function delClassName(iframe, name) {
    const matches = iframe.getElementsByClassName(name);
    // The loop only terminates because the collection shrinks as nodes are
    // detached (getElementsByClassName returns a live collection).
    while (matches.length) {
        const first = matches[0];
        first.parentNode.removeChild(first);
    }
}
/**
 * Strip text decoration (underlines) from every element with the given class
 * and from each of its ancestors up to, but not including, the document body.
 *
 * @param {Document} iframe - Document to process (callers pass the iframe's document).
 * @param {string} name - Class name of the elements whose underline is removed.
 */
function eraseUnderlineClassName(iframe, name) {
    const matches = iframe.getElementsByClassName(name);
    for (let i = 0; i < matches.length; i++) {
        let node = matches[i];
        // Stop at the body, and also when the walk leaves the element tree
        // (null parent, or a node without a style object such as the document
        // itself); otherwise a match outside <body> would throw a TypeError.
        while (node && node !== iframe.body && node.style) {
            node.style.textDecoration = "none";
            node = node.parentNode;
        }
    }
}
/**
 * Copy a string to the clipboard through a temporary, invisible textarea.
 *
 * @param {string} val - Text to place on the clipboard.
 */
function copy(val) {
    const scratch = document.createElement("textarea");
    scratch.style.opacity = 0;
    scratch.value = val;
    // The DOM property is `readOnly`; assigning `readonly` only creates an
    // unrelated expando property and leaves the textarea editable.
    scratch.readOnly = true;
    document.body.appendChild(scratch);
    try {
        scratch.select();
        // NOTE(review): execCommand is a legacy API; it is kept because it is
        // synchronous and needs no clipboard permission prompt.
        document.execCommand("Copy");
    } finally {
        // Always detach the helper node, even if selecting or copying throws.
        document.body.removeChild(scratch);
    }
}
// Shared crawl state: `words` is the list of input lines and `id` is the index
// of the line currently being processed. Both are (re)initialised by work().
var words;
var id;

/**
 * Advance to the next line of the word list and start loading it.
 */
function next_word() {
    ++id;
    loadIframe();
}
/**
 * Reduce the page currently shown in the iframe to its first dictionary
 * entry, strip unwanted elements, append the resulting HTML to the "output"
 * textarea and move on to the next word.
 *
 * Does nothing when the iframe's src does not start with "http". When the
 * loaded page contains no element with class "entry", a message is appended
 * to the "error" textarea and the crawl continues with the next word instead
 * of aborting with a TypeError.
 */
function extract() {
    const frame = document.getElementById("iframe");
    if (frame.src.slice(0, 4) != "http") {
        return;
    }

    const page = frame.contentWindow.document;
    const entry = page.getElementsByClassName("entry")[0];
    if (!entry) {
        document.getElementById("error").value += "Failed to extract an entry from " + frame.src + ".\n";
        next_word();
        return;
    }

    // Keep only the first entry; everything else on the page is discarded.
    page.body.innerHTML = entry.outerHTML;

    // Classes removed from the entry. Presumably audio buttons, word-list
    // buttons and phrase/idiom/related-word sections — verify against the
    // site's current markup.
    const removedClasses = [
        "daud",
        "daccord",
        "dwl",
        "def-info",
        "lb-cm",
        "dphrase-info",
        "phrasal_verb",
        "phrasal_verbs",
        "idiom",
        "idioms",
        "related_word",
        "related_words",
        "dbtn",
    ];
    for (const className of removedClasses) {
        delClassName(page, className);
    }

    // Classes whose elements (and their ancestors) lose their underline.
    for (const className of ["query", "dtrans"]) {
        eraseUnderlineClassName(page, className);
    }

    document.getElementById("output").value += page.body.innerHTML;
    next_word();
}
/**
 * Load the dictionary page for the current word (`words[id]`) into the iframe.
 *
 * The English-Chinese (simplified) dictionary is tried first and the English
 * dictionary second. After each navigation the function waits a fixed delay
 * and then inspects the iframe document's baseURI: if it equals the bare
 * dictionary root, the lookup is treated as failed and logged to the "error"
 * textarea; otherwise extract() is called. When both lookups fail the crawl
 * continues with next_word(). When the word list is exhausted, "OK!" is
 * appended to the "error" textarea.
 *
 * Blank lines are skipped, and each word is trimmed and URL-encoded before it
 * is placed in the address.
 */
function loadIframe() {
    const DICTIONARY_ROOT = "https://dictionary.cambridge.org/dictionary/";
    // Fixed wait after each navigation before the result is inspected.
    const LOAD_DELAY_MS = 3000;

    const frame = document.getElementById("iframe");
    const errorBox = document.getElementById("error");

    // Skip blank lines anywhere in the list, not only a trailing one.
    while (id < words.length && words[id].trim() === "") {
        id++;
    }
    if (id >= words.length) {
        errorBox.value += "OK!\n";
        return;
    }

    const word = words[id].trim();
    // Encode so that characters such as "/", "?" or "#" cannot alter the URL.
    const slug = encodeURIComponent(word);

    const attempts = [
        {
            section: "english-chinese-simplified",
            failure: "Failed to find English-Chinese interpretation of the word " + word + ".\n",
        },
        {
            section: "english",
            failure: "Failed to find the word " + word + ".\n",
        },
    ];

    // Try the dictionary at `index`; on failure fall through to the next one.
    function attempt(index) {
        const current = attempts[index];
        const sectionRoot = DICTIONARY_ROOT + current.section + "/";
        frame.src = sectionRoot + slug;
        setTimeout(function () {
            const loaded = frame.contentWindow.document;
            if (loaded.baseURI !== sectionRoot) {
                extract();
                return;
            }
            // Landing on the bare section root means the word was not found there.
            errorBox.value += current.failure;
            if (index + 1 < attempts.length) {
                attempt(index + 1);
            } else {
                next_word();
            }
        }, LOAD_DELAY_MS);
    }

    attempt(0);
}
/**
 * Start a crawl: read the word list from the "input" textarea (one word per
 * line), reset the position and both result textareas, then load the first word.
 */
function work() {
    const box = (elementId) => document.getElementById(elementId);

    words = box("input").value.split("\n");
    id = 0;
    console.log(words);

    // Clear the results of any previous run.
    for (const elementId of ["output", "error"]) {
        box(elementId).value = "";
    }

    loadIframe();
}
/**
 * Print the collected entries: write the contents of the "output" textarea
 * into the iframe's body as HTML and open the print dialog for the iframe.
 */
function print() {
    const frameWindow = document.getElementById("iframe").contentWindow;
    const pageBody = frameWindow.document.body;
    pageBody.innerHTML = document.getElementById("output").value;
    frameWindow.print();
}