开发者体验：Java抓取百度Top500歌曲及源码-51CTO.COM

本文的主要工作是用Java抓取百度Top500榜单中的歌曲。每首歌曲包括3个属性：歌名、歌曲在线播放地址和歌词内容(符合LRC歌词格式)。目前已完成歌名和歌曲地址的抓取，歌词抓取尚未实现。由于百度的歌曲地址很多是通过js动态获取的，歌曲地址改用第三方音乐搜索来获取会方便些（原文写的是搜狗音乐搜索，但下面源码实际请求的是 so.mp3.qihoo.com）。所有的源码如下：
/** *//**   
　　http://www.bt285.cn http://www.5a520.cn   
　　*/   
  package com.common.utils;

  import java.io.BufferedReader;
  import java.io.ByteArrayOutputStream;
  import java.io.Closeable;
  import java.io.IOException;
  import java.io.InputStream;
  import java.io.InputStreamReader;
  import java.io.OutputStreamWriter;
  import java.io.UnsupportedEncodingException;
  import java.net.HttpURLConnection;
  import java.net.MalformedURLException;
  import java.net.URL;
  import java.net.URLConnection;
  import java.net.URLDecoder;
  import java.net.URLEncoder;
  import java.util.ArrayList;
  import java.util.HashSet;
  import java.util.List;
  import java.util.Set;
  import java.util.TreeSet;
  import java.util.regex.Matcher;
  import java.util.regex.Pattern;

  import org.htmlparser.Node;
  import org.htmlparser.NodeFilter;
  import org.htmlparser.Parser;
  import org.htmlparser.filters.NodeClassFilter;
  import org.htmlparser.filters.OrFilter;
  import org.htmlparser.nodes.TextNode;
  import org.htmlparser.tags.LinkTag;
  import org.htmlparser.util.NodeList;
  import org.htmlparser.util.ParserException;

  import com.common.doc.FileOperUtils;
　　class Song{   
　　private String name;   
　　private String url;   
　　private String lrc;   
　　public Song(String name,String url){   
　　this.name = name;   
　　this.url = url;   
　　this.lrc = "";   
　　}   
　　public String getName() {   
　　return name;   
　　}   
　　public void setName(String name) {   
　　this.name = name;   
　　}   
　　public String getUrl() {   
　　return url;   
　　}   
　　public void setUrl(String url) {   
　　this.url = url;   
　　}   
　　public String getLrc() {   
　　return lrc;   
　　}   
　　public void setLrc(String lrc) {   
　　this.lrc = lrc;   
　　}   
　　}   
　　public class BaiduMP3 {   
　　public static String visitURL(String strUrl) {   
　　URL url = null;   
　　try {   
　　url = new URL(strUrl);   
　　} catch (MalformedURLException e) {   
　　e.printStackTrace();   
　　}   
　　URLConnection conn = null;   
　　try {   
　　conn = url.openConnection();   
　　conn.setDoOutput(true);   
　　} catch (IOException e) {   
　　System.out.println("e:"+e.getMessage());   
　　}   
　　OutputStreamWriter out;   
　　try {   
　　out = new OutputStreamWriter(conn.getOutputStream(), "GBK");   
　　out.flush();   
　　out.close();   
　　} catch (UnsupportedEncodingException e2) {   
　　e2.printStackTrace();   
　　} catch (IOException e2) {   
　　e2.printStackTrace();   
　　}   
　　// 接收返回信息   
　　BufferedReader rd = null;   
　　try {   
　　rd = new BufferedReader(   
　　new InputStreamReader(conn.getInputStream()));   
　　return rd.readLine();   
　　} catch (IOException e1) {   
　　e1.printStackTrace();   
　　}   
　　return "";   
　　}   
　　/** *//**   
　　* 功能说明：访问指定的URL并检查返回结果。   
　　* @param strUrl   
　　* @param successFlag 请求成功的标识，比如包含“_SUCCESS”字。   
　　* @return   
　　*/   
　　public static String visitURL(String strUrl, String successFlag) {   
　　boolean rs = false;   
　　HttpURLConnection jconn = null;   
　　ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();   
　　try {   
　　URL url = new URL(strUrl);   
　　jconn = (HttpURLConnection) url.openConnection();   
　　jconn.setDoOutput(true);   
　　jconn.setDoInput(true);   
　　jconn.connect();   
　　InputStream in = jconn.getInputStream();   
　　byte[] buf = new byte[4096];   
　　int bytesRead;   
　　while ((bytesRead = in.read(buf)) != -1) {   
　　byteArrayOutputStream.write(buf, 0, bytesRead);   
　　}   
　　String strRead = new String(byteArrayOutputStream.toByteArray(),"GBK");   
　　return strRead;   
　　} catch (MalformedURLException e) {   
　　e.printStackTrace();   
　　} catch (IOException e) {   
　　e.printStackTrace();   
　　} finally {   
　　jconn.disconnect();   
　　try {   
　　byteArrayOutputStream.close();   
　　} catch (IOException e) {   
　　e.printStackTrace();   
　　}   
　　}   
　　return "";   
　　}   
　　private static boolean isTrimEmptyOrBlank(String astr) {   
　　if ((null == astr) || (astr.length() == 0) || " ".equals(astr)) {   
　　return true;   
　　}   
　　astrastr = astr.trim();   
　　if ((null == astr) || (astr.length() == 0)) {   
　　return true;   
　　}   
　　return false;   
　　}   
　　private static String getFilteredContent(String htmlContent, String reg,int i) {   
　　String content = "";   
　　int k=1;   
　　Pattern pp = Pattern.compile(reg, Pattern.DOTALL);   
　　Matcher m = pp.matcher(htmlContent);   
　　while (m.find()) {   
　　content = m.group();   
　　if(k++==i)   
　　break;   
　　}   
　　return content;   
　　}   
　　public static List getBaiduSongs(){   
　　List ss = new ArrayList();   
　　String htmlContent = visitURL("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2","s");   
　　String encode = "GBK";   
　　//　　　　　 System.out.println("===========================================================================");   
　　//　　　　　 System.out.println(htmlContent);   
　　//　　　　　 System.out.println("===========================================================================");   
　　String reg = "(.*?)";   
　　htmlContent = getFilteredContent(htmlContent,reg,0);   
　　//FileOperUtils.writeFile("c:\\1.html", htmlContent, false);   
　　String line = "",lineurl="";   
　　Node anode = null;   
　　TextNode textnode = null;   
　　try {   
　　Parser parser = Parser.createParser(htmlContent, encode);   
　　NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);   
　　OrFilter lastFilter = new OrFilter();   
　　lastFilter.setPredicates(new NodeFilter[] { textFilter });   
　　NodeList nodeList = parser.parse(lastFilter);   
　　Node[] nodes = nodeList.toNodeArray();   
　　for (int i = 0; i < nodes.length; i++) {   
　　anode = (Node) nodes[i];   
　　if(anode instanceof LinkTag){   
　　LinkTag txt = (LinkTag)anode;   
　　line = txt.getLinkText();   
　　if(txt.getPreviousSibling()!=null){   
　　if(txt.getPreviousSibling().toString().indexOf("(")>=0)   
　　continue;   
　　}   
　　line = txt.getLinkText();   
　　lineurl = txt.getAttribute("href");   
　　//System.out.println(txt.getLink());   
　　}   
　　if (isTrimEmptyOrBlank(line)||isTrimEmptyOrBlank(lineurl))   
　　continue;   
　　ss.add(new Song(line,getSongURL(line)));   
　　}   
　　} catch (ParserException pe) {   
　　pe.printStackTrace();   
　　}   
　　return ss;   
　　}   
　　private static String getSongURL(String songname){   
　　try {   
　　String ss = URLEncoder.encode(songname,"GBK");   
　　String htmlContent = visitURL("http://so.mp3.qihoo.com/?type=0&ssrc=s&kw="+ss,"s");   
　　String encode = "GBK";   
　　http://www.feng123.com   
　　String reg = "(.*?)";　 http://www.5a520.cn   
　　htmlContent = getFilteredContent(htmlContent,reg,1);   
　　String line = "",lineurl="";   
　　Node anode = null;   
　　TextNode textnode = null;   
　　Parser parser = Parser.createParser(htmlContent, encode);   
　　NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);   
　　OrFilter lastFilter = new OrFilter();   
　　lastFilter.setPredicates(new NodeFilter[] { textFilter });   
　　NodeList nodeList = parser.parse(lastFilter);   
　　Node[] nodes = nodeList.toNodeArray();   
　　for (int i = 0; i < nodes.length; i++) {   
　　anode = (Node) nodes[i];   
　　if(anode instanceof LinkTag){   
　　LinkTag txt = (LinkTag)anode;   
　　line = txt.getLinkText();   
　　lineurl = txt.getAttribute("href");   
　　if(!isTrimEmptyOrBlank(lineurl) && lineurl.startsWith("down.html")){   
　　String s = getFilteredContent(lineurl,"u=(.*?)\\&",0);   
　　if(!s.equals("")&&s.length()>5){   
　　s = Utils.replace(s, "u=", "");   
　　s = Utils.replace(s, "&", "");   
　　s = URLDecoder.decode(s,"GBK");   
　　return s;   
　　}   
　　}   
　　}   
　　}   
　　} catch (Exception pe) {   
　　pe.printStackTrace();   
　　}   
　　return "";   
　　}   
　　public static void main(String[] args) throws Exception{   
　　List ss = getBaiduSongs();   
　　int idx = 0;   
　　for(Song s:ss){   
　　System.out.println((++idx)+":"+s.getName()+"->"+s.getUrl());   
　　}   
　　//　　　　　 String ss = getSongURL("国家");   
　　//　　　　　 System.out.println(ss);   
　　//　　　　　 String s = URLDecoder.decode("http%3A%2F%2F http://www.5a520.cn %2F%B9%FA%BC%D2.mp3","GBK");   
　　//　　　　　 System.out.println(s);   
　　}   
　　}
至此，用Java抓取百度Top500歌曲的工作及其源码介绍完毕。
【编辑推荐】