本文的主要工作是说明如何通过Java抓取百度(Baidu)榜单上好听的歌曲。每首歌曲需要抓取3个属性:歌名、歌曲在线播放地址和歌词内容(符合LRC歌词格式)。目前已完成歌名和歌曲地址的抓取,歌词抓取尚未实现。由于百度的歌曲地址很多是通过JS动态获取的,所以歌曲地址这里改用搜狗音乐搜索来获取,会更方便一些。全部源码如下:
- /** *//**
- http://www.bt285.cn http://www.5a520.cn
- */
- package com.common.utils;
import java.io.BufferedReader;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.common.doc.FileOperUtils;
/**
 * Simple mutable holder for one scraped song: its title, its playable URL and its lyrics.
 *
 * <p>The lyrics start out as the empty string and are only changed through
 * {@link #setLrc(String)}.
 */
class Song {

    /** Song title as shown in the chart. */
    private String name;

    /** Address the song can be played or downloaded from. */
    private String url;

    /** Lyrics text; empty until explicitly set. */
    private String lrc;

    /**
     * Creates a song with the given title and address and empty lyrics.
     *
     * @param name song title
     * @param url  playable address of the song
     */
    public Song(String name, String url) {
        this.name = name;
        this.url = url;
        this.lrc = "";
    }

    /** @return the song title */
    public String getName() {
        return name;
    }

    /** @param name the new song title */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the playable address of the song */
    public String getUrl() {
        return url;
    }

    /** @param url the new playable address */
    public void setUrl(String url) {
        this.url = url;
    }

    /** @return the lyrics text, or the empty string if none was set */
    public String getLrc() {
        return lrc;
    }

    /** @param lrc the new lyrics text */
    public void setLrc(String lrc) {
        this.lrc = lrc;
    }
}
- public class BaiduMP3 {
- public static String visitURL(String strUrl) {
- URL url = null;
- try {
- url = new URL(strUrl);
- } catch (MalformedURLException e) {
- e.printStackTrace();
- }
- URLConnection conn = null;
- try {
- conn = url.openConnection();
- conn.setDoOutput(true);
- } catch (IOException e) {
- System.out.println("e:"+e.getMessage());
- }
- OutputStreamWriter out;
- try {
- out = new OutputStreamWriter(conn.getOutputStream(), "GBK");
- out.flush();
- out.close();
- } catch (UnsupportedEncodingException e2) {
- e2.printStackTrace();
- } catch (IOException e2) {
- e2.printStackTrace();
- }
- // 接收返回信息
- BufferedReader rd = null;
- try {
- rd = new BufferedReader(
- new InputStreamReader(conn.getInputStream()));
- return rd.readLine();
- } catch (IOException e1) {
- e1.printStackTrace();
- }
- return "";
- }
- /** *//**
- * 功能说明:访问指定的URL并检查返回结果。
- * @param strUrl
- * @param successFlag 请求成功的标识,比如包含“_SUCCESS”字。
- * @return
- */
- public static String visitURL(String strUrl, String successFlag) {
- boolean rs = false;
- HttpURLConnection jconn = null;
- ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
- try {
- URL url = new URL(strUrl);
- jconn = (HttpURLConnection) url.openConnection();
- jconn.setDoOutput(true);
- jconn.setDoInput(true);
- jconn.connect();
- InputStream in = jconn.getInputStream();
- byte[] buf = new byte[4096];
- int bytesRead;
- while ((bytesRead = in.read(buf)) != -1) {
- byteArrayOutputStream.write(buf, 0, bytesRead);
- }
- String strRead = new String(byteArrayOutputStream.toByteArray(),"GBK");
- return strRead;
- } catch (MalformedURLException e) {
- e.printStackTrace();
- } catch (IOException e) {
- e.printStackTrace();
- } finally {
- jconn.disconnect();
- try {
- byteArrayOutputStream.close();
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- return "";
- }
- private static boolean isTrimEmptyOrBlank(String astr) {
- if ((null == astr) || (astr.length() == 0) || " ".equals(astr)) {
- return true;
- }
- astrastr = astr.trim();
- if ((null == astr) || (astr.length() == 0)) {
- return true;
- }
- return false;
- }
- private static String getFilteredContent(String htmlContent, String reg,int i) {
- String content = "";
- int k=1;
- Pattern pp = Pattern.compile(reg, Pattern.DOTALL);
- Matcher m = pp.matcher(htmlContent);
- while (m.find()) {
- content = m.group();
- if(k++==i)
- break;
- }
- return content;
- }
- public static List getBaiduSongs(){
- List ss = new ArrayList();
- String htmlContent = visitURL("http://list.mp3.baidu.com/topso/mp3topsong.html?id=1?top2","s");
- String encode = "GBK";
- // System.out.println("===========================================================================");
- // System.out.println(htmlContent);
- // System.out.println("===========================================================================");
- String reg = "(.*?)";
- htmlContent = getFilteredContent(htmlContent,reg,0);
- //FileOperUtils.writeFile("c:\\1.html", htmlContent, false);
- String line = "",lineurl="";
- Node anode = null;
- TextNode textnode = null;
- try {
- Parser parser = Parser.createParser(htmlContent, encode);
- NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
- OrFilter lastFilter = new OrFilter();
- lastFilter.setPredicates(new NodeFilter[] { textFilter });
- NodeList nodeList = parser.parse(lastFilter);
- Node[] nodes = nodeList.toNodeArray();
- for (int i = 0; i < nodes.length; i++) {
- anode = (Node) nodes[i];
- if(anode instanceof LinkTag){
- LinkTag txt = (LinkTag)anode;
- line = txt.getLinkText();
- if(txt.getPreviousSibling()!=null){
- if(txt.getPreviousSibling().toString().indexOf("(")>=0)
- continue;
- }
- line = txt.getLinkText();
- lineurl = txt.getAttribute("href");
- //System.out.println(txt.getLink());
- }
- if (isTrimEmptyOrBlank(line)||isTrimEmptyOrBlank(lineurl))
- continue;
- ss.add(new Song(line,getSongURL(line)));
- }
- } catch (ParserException pe) {
- pe.printStackTrace();
- }
- return ss;
- }
- private static String getSongURL(String songname){
- try {
- String ss = URLEncoder.encode(songname,"GBK");
- String htmlContent = visitURL("http://so.mp3.qihoo.com/?type=0&ssrc=s&kw="+ss,"s");
- String encode = "GBK";
- http://www.feng123.com
- String reg = "(.*?)"; http://www.5a520.cn
- htmlContent = getFilteredContent(htmlContent,reg,1);
- String line = "",lineurl="";
- Node anode = null;
- TextNode textnode = null;
- Parser parser = Parser.createParser(htmlContent, encode);
- NodeClassFilter textFilter = new NodeClassFilter(LinkTag.class);
- OrFilter lastFilter = new OrFilter();
- lastFilter.setPredicates(new NodeFilter[] { textFilter });
- NodeList nodeList = parser.parse(lastFilter);
- Node[] nodes = nodeList.toNodeArray();
- for (int i = 0; i < nodes.length; i++) {
- anode = (Node) nodes[i];
- if(anode instanceof LinkTag){
- LinkTag txt = (LinkTag)anode;
- line = txt.getLinkText();
- lineurl = txt.getAttribute("href");
- if(!isTrimEmptyOrBlank(lineurl) && lineurl.startsWith("down.html")){
- String s = getFilteredContent(lineurl,"u=(.*?)\\&",0);
- if(!s.equals("")&&s.length()>5){
- s = Utils.replace(s, "u=", "");
- s = Utils.replace(s, "&", "");
- s = URLDecoder.decode(s,"GBK");
- return s;
- }
- }
- }
- }
- } catch (Exception pe) {
- pe.printStackTrace();
- }
- return "";
- }
- public static void main(String[] args) throws Exception{
- List ss = getBaiduSongs();
- int idx = 0;
- for(Song s:ss){
- System.out.println((++idx)+":"+s.getName()+"->"+s.getUrl());
- }
- // String ss = getSongURL("国家");
- // System.out.println(ss);
- // String s = URLDecoder.decode("http%3A%2F%2F http://www.5a520.cn %2F%B9%FA%BC%D2.mp3","GBK");
- // System.out.println(s);
- }
- }
至此Java抓取百度Top500歌曲及源码的工作完成。
【编辑推荐】