一种在线词典简易爬虫

Cambridge Dictionary

爬虫规则:

  1. 优先选择 English-Chinese 页面,若没有,则选择 English 页面。
  2. 去除了多余的下划线、“Add to word list”按钮、播放按钮等元素。

使用方法:

  1. 打开 Cambridge Dictionary 网站(https://dictionary.cambridge.org/)。脚本必须在该站点的页面上运行,否则无法读取 iframe 中的词条内容。
  2. 打开浏览器的控制台。方法 1:页面空白位置右键,检查元素;方法 2:按 F12。
  3. 选择 Console 一栏,粘贴下面的代码并运行。
  4. 在新出现的最右边的框中输入需要整理的英文单词,一行一个。
  5. 点击“work”按钮,爬虫开始采集。
  6. 爬虫采集完毕(输入框左侧的错误信息框中出现“OK!”字样)时,点击“print”按钮,开始打印页面。
/**
 * Replaces the whole body of the current page with the crawler UI:
 * an iframe (id "iframe") that loads dictionary pages, a read-only
 * "output" textarea, a read-only "error" textarea, an editable "input"
 * textarea for the word list, and two buttons wired to work() and print().
 */
function ready(){
	const pieces=[
		'<iframe id="iframe" src="" height="500px" width="500px"></iframe>',
		'<textarea id="output" style="height:500px;" readonly></textarea>',
		'<textarea id="error" style="height:500px;" readonly></textarea>',
		'<textarea id="input" style="height:500px;" placeholder="input your word list here"></textarea>',
		'<button onclick="work();">work</button>',
		'<button onclick="print();">print</button>'
	];
	// Joined without a separator so the resulting markup is one unbroken string.
	document.body.innerHTML=pieces.join("");
}
// Build the crawler UI immediately when the script is run (this overwrites the current page body).
ready();
/**
 * Removes every element that carries the given class from a document.
 *
 * @param {Document} iframe - Document to clean up (callers pass the iframe's document).
 * @param {string} name - Class name whose elements are detached from their parents.
 */
function delClassName(iframe, name) {
	const matches = iframe.getElementsByClassName(name);
	// The loop relies on the collection shrinking as elements are detached:
	// keep dropping whatever is currently first until nothing is left.
	while (matches.length > 0) {
		const victim = matches[0];
		victim.parentNode.removeChild(victim);
	}
}
/**
 * Clears text decoration (e.g. underlines) on every element with the given
 * class and on each of its ancestors below the document body.
 *
 * For each match the function walks up the parentNode chain, setting
 * `style.textDecoration = "none"` on every node, and stops when it reaches
 * `iframe.body` (the body itself is not modified).
 *
 * @param {Document} iframe - Document to process (callers pass the iframe's document).
 * @param {string} name - Class name of the elements whose ancestor chains are cleaned.
 */
function eraseUnderlineClassName(iframe, name) {
	const matches = iframe.getElementsByClassName(name);
	for (let i = 0; i < matches.length; i++) {
		let node = matches[i];
		// Besides stopping at the body, also stop when the chain leaves the
		// element tree: a match outside <body> would otherwise climb to the
		// Document node (which has no `style`) or to a null parent and throw.
		while (node && node !== iframe.body && node.style) {
			node.style.textDecoration = "none";
			node = node.parentNode;
		}
	}
}
/**
 * Copies a string to the clipboard by selecting it inside a temporary,
 * invisible, read-only textarea and issuing the "Copy" editing command.
 *
 * @param {string} val - Text to place on the clipboard.
 */
function copy(val){
	const scratch=document.createElement("textarea");
	scratch.style.opacity=0;
	scratch.value=val;
	// The DOM property is camel-cased `readOnly`; a lower-case `readonly`
	// assignment only creates an unrelated expando property.
	scratch.readOnly=true;
	document.body.appendChild(scratch);
	try{
		scratch.select();
		document.execCommand("Copy");
	}finally{
		// Always take the helper node out again, even if the copy command throws.
		document.body.removeChild(scratch);
	}
}
// Shared crawl state: `words` holds the lines of the input box (assigned in work()),
// and `id` is the index into `words` of the word currently being processed.
var words,id;
/**
 * Moves the shared cursor `id` to the following word and starts loading it.
 */
function next_word(){
	++id;
	loadIframe();
}
/**
 * Reduces the page currently shown in the iframe to its first ".entry"
 * element, strips unwanted widgets and underlines from it, appends the
 * resulting HTML to the "output" textarea and moves on to the next word.
 *
 * Does nothing when the iframe has not been pointed at an http(s) URL.
 * When the page contains no ".entry" element, a message is appended to the
 * "error" textarea and the crawl continues with the next word instead of
 * aborting with an exception.
 */
function extract(){
	const frame=document.getElementById("iframe");
	if(frame.src.slice(0,4)!=="http")return;
	const iframe=frame.contentWindow.document;

	const entry=iframe.getElementsByClassName("entry")[0];
	if(!entry){
		// Page not loaded in time or laid out differently: report it and keep
		// going, otherwise the whole crawl would stop here silently.
		document.getElementById("error").value+=`Failed to extract the entry of the word ${words[id]}.\n`;
		next_word();
		return;
	}
	iframe.body.innerHTML=entry.outerHTML;

	// Classes of elements that are dropped from the entry. Presumably these
	// are the audio/play buttons, "Add to word list" buttons and similar
	// extras on the dictionary page; the exact mapping is not verified here.
	const unwantedClasses=[
		"daud","daccord","dwl","def-info","lb-cm","dphrase-info",
		"phrasal_verb","phrasal_verbs","idiom","idioms",
		"related_word","related_words","dbtn"
	];
	for(const className of unwantedClasses)delClassName(iframe,className);

	// Classes whose elements (and their ancestors) lose their text decoration.
	for(const className of ["query","dtrans"])eraseUnderlineClassName(iframe,className);

	document.getElementById("output").value+=iframe.body.innerHTML;
	next_word();
}
/**
 * Loads the dictionary page for the current word (`words[id]`) into the
 * iframe and, after a fixed delay, hands the page over to extract().
 *
 * Blank lines anywhere in the word list are skipped. Once the list is
 * exhausted, "OK!" is appended to the "error" textarea and nothing else
 * happens. Otherwise the English-Chinese (simplified) section is tried first;
 * if the iframe ends up on the bare section URL the word is treated as
 * missing there and the English section is tried. If that also ends on the
 * bare section URL, the word is reported as not found and the crawl moves on.
 */
function loadIframe(){
	const BASE_URL="https://dictionary.cambridge.org/dictionary/";
	// Fixed wait before the page in the iframe is inspected.
	const LOAD_DELAY_MS=3000;

	// Skip empty or whitespace-only lines so they are not requested as words.
	while(id<words.length&&words[id].trim()==="")id++;
	if(id>=words.length){
		document.getElementById("error").value+="OK!\n";
		return;
	}

	const word=words[id].trim();
	const frame=document.getElementById("iframe");
	const errorBox=document.getElementById("error");

	// Points the iframe at `word` inside one dictionary section and, after the
	// delay, either extracts the page or calls `onMissing`.
	const visit=function(section,onMissing){
		const sectionUrl=BASE_URL+section+"/";
		// Encode the word so characters such as "?", "#" or "/" stay part of the path segment.
		frame.src=sectionUrl+encodeURIComponent(word);
		setTimeout(function(){
			// Landing on the bare section URL is taken to mean the word has no
			// page there (presumably the site redirects unknown words; unverified).
			if(frame.contentWindow.document.baseURI===sectionUrl)onMissing();
			else extract();
		},LOAD_DELAY_MS);
	};

	visit("english-chinese-simplified",function(){
		errorBox.value+="Failed to find English-Chinese interpretation of the word "+word+".\n";
		visit("english",function(){
			errorBox.value+="Failed to find the word "+word+".\n";
			next_word();
		});
	});
}
/**
 * Click handler of the "work" button: reads the word list from the "input"
 * textarea (one word per line), resets the crawl cursor, clears the "output"
 * and "error" textareas and starts loading the first word.
 */
function work(){
	const rawList=document.getElementById("input").value;
	words=rawList.split("\n");
	id=0;
	console.log(words);
	// Start every run with empty result and message boxes.
	for(const boxId of ["output","error"]){
		document.getElementById(boxId).value="";
	}
	loadIframe();
}
/**
 * Click handler of the "print" button: puts everything collected in the
 * "output" textarea into the iframe's body as HTML and opens the print
 * dialog for the iframe's window.
 */
function print(){
	const frameWindow=document.getElementById("iframe").contentWindow;
	const collectedHtml=document.getElementById("output").value;
	frameWindow.document.body.innerHTML=collectedHtml;
	frameWindow.print();
}