/*
建什么网站比较好,网站访客qq获取代码,建筑工程教育网官方网站,巩义网站优化

我常使用C语言写网络爬虫，能够将网页爬出来，但是，图片却爬不出来，有没有大佬帮解决一下！！！代码：
#include <stdio.h>
#include <string.h>
#include <WinSock2.h>…

我常使用C语言写网络爬虫，能够将网页爬出来，但是图片却爬不出来，有没有大佬帮解决一下！代码：
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <WinSock2.h>
#pragma comment(lib, "ws2_32.lib")
/*
网络部分:http url
url 三部分
https://www.baidu.com/?tn62095104_29_oem_dgch6
1.协议 http 超文本传输协议
2.主机名 www.baidu.com 需要的ip地址 240e:ff:e020:966:0:ff:b042:f296
3.资源名 /?tn62095104_29_oem_dgch6*/void parseUrl(const char* url, char* host, char* resPath);
void getImgUrl(const char* html, char* imgUrl);typedef struct Spider
{char host[128]; //主机名char resPath[128]; //资源路径SOCKET fd;
}Spider;
/*
 * Prepare a Spider for the resource named by url.
 *
 * Both string fields are cleared, the socket is marked as "not open", and
 * parseUrl() then fills in host and resPath from url.
 *
 * spider: object to initialise; nothing happens when it is NULL.
 * url:    URL to fetch; passed straight through to parseUrl().
 */
void spider_init(Spider* spider, const char* url)
{
    if (!spider)
        return;

    memset(spider->host, 0, sizeof(spider->host));
    // Each field is cleared with its own size so the two arrays can be
    // resized independently.
    memset(spider->resPath, 0, sizeof(spider->resPath));
    // No connection yet; INVALID_SOCKET keeps the field from holding garbage.
    spider->fd = INVALID_SOCKET;

    parseUrl(url, spider->host, spider->resPath);
}
/*
 * Connect to the server named by spider->host (network programming part).
 *
 * Starts Winsock 2.2, creates a TCP socket, resolves the host name with
 * gethostbyname() and connects to port 80 (plain HTTP).
 *
 * On success spider->fd holds the connected socket. On any failure a
 * message is printed, the socket (if one was created) is closed, and
 * spider->fd is left as INVALID_SOCKET so a later send()/recv() fails
 * cleanly instead of using a half-initialised socket.
 */
void spider_connect(Spider* spider)
{
    if (!spider)
        return;
    spider->fd = INVALID_SOCKET;

    // Request Winsock version 2.2.
    WSADATA wsadata;
    int rc = WSAStartup(MAKEWORD(2, 2), &wsadata);
    if (rc != 0)
    {
        // WSAStartup reports its error through the return value.
        printf("WSAStartup failed %d\n", rc);
        return;
    }

    // Create the socket. socket() signals failure with INVALID_SOCKET,
    // not SOCKET_ERROR.
    SOCKET fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (fd == INVALID_SOCKET)
    {
        printf("create socket falied %d\n", WSAGetLastError());
        return;
    }

    // Resolve the host name to an IPv4 address.
    HOSTENT* hent = gethostbyname(spider->host);
    if (!hent || hent->h_addrtype != AF_INET || !hent->h_addr_list[0])
    {
        printf("get host ip failed %d\n", WSAGetLastError());
        closesocket(fd);
        return;
    }

    // Connect to the server.
    SOCKADDR_IN addr;
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_port = htons(80); // HTTP port, converted to network byte order
    memcpy(&addr.sin_addr, hent->h_addr_list[0], sizeof(addr.sin_addr));

    if (SOCKET_ERROR == connect(fd, (const struct sockaddr*)&addr, sizeof(addr)))
    {
        // Print before closesocket() so the error code is not overwritten.
        printf("connect falied %d\n", WSAGetLastError());
        closesocket(fd);
        return;
    }

    spider->fd = fd;
}

// Parse a URL into host name and resource path
/*
 * Split url into a host name and a resource path.
 *
 *   "https://www.baidu.com/?tn=1"  -> host "www.baidu.com", resPath "/?tn=1"
 *   "http://www.netbian.com"       -> host "www.netbian.com", resPath "/"
 *   "//img.example.com/a.jpg"      -> host "img.example.com", resPath "/a.jpg"
 *   "/uploads/a.jpg"               -> host "" (empty),        resPath "/uploads/a.jpg"
 *
 * url:     URL to split; when NULL (or when an output pointer is NULL) the
 *          function returns without writing anything.
 * host:    receives the host name, always NUL-terminated.
 * resPath: receives the resource path, always NUL-terminated and always
 *          starting with '/'.
 *
 * Both output buffers must hold at least 128 bytes (the size of the Spider
 * fields); longer parts are truncated to fit.
 */
void parseUrl(const char* url, char* host, char* resPath)
{
    enum { PART_CAP = 128 }; // capacity of each output buffer

    if (!url || !host || !resPath)
        return;

    // Skip a leading "scheme://" or "//". Only the FIRST slash is examined,
    // so a "//" that appears later (inside the path or query) is not
    // mistaken for the scheme separator.
    const char* ph = url;
    const char* slash = strchr(url, '/');
    if (slash && slash[1] == '/' && (slash == url || slash[-1] == ':'))
        ph = slash + 2;

    puts(ph);

    // The host ends at the first '/', '?' or '#', or at the end of the string.
    size_t hostLen = strcspn(ph, "/?#");
    const char* rest = ph + hostLen;

    if (hostLen >= PART_CAP)
        hostLen = PART_CAP - 1;
    memcpy(host, ph, hostLen);
    host[hostLen] = '\0'; // explicit terminator: do not rely on a pre-zeroed buffer

    if (*rest == '/')
        snprintf(resPath, PART_CAP, "%s", rest);
    else if (*rest == '?')
        snprintf(resPath, PART_CAP, "/%s", rest); // query directly after the host
    else
        snprintf(resPath, PART_CAP, "/");         // no path given: request the root
}
//获取网页
void getHtml(Spider* spider)
{//连接到服务器spider_connect(spider);//给服务器发送请求 char header[128] { 0 };sprintf(header, GET %s HTTP/1.1\r\n, spider-resPath);sprintf(header strlen(header), Host:%s\r\n, spider-host);strcat(header, Connection:close\r\n);strcat(header, \r\n);if (SOCKET_ERROR send(spider-fd, header, strlen(header), 0)) {printf(send failed %d\n, WSAGetLastError());return;}char html[1024 * 5] { 0 };//获取网页int len recv(spider-fd, html, 1024 * 5, 0);if (len SOCKET_ERROR) {printf(recv failed %d\n, WSAGetLastError());}else{//保存到文件FILE* fp fopen(maye.html, w);{if (!fp)return;}fwrite(html, sizeof(char), strlen(html), fp);fclose(fp);printf(%s\n, html);}char imgUrl[128] { 0 };getImgUrl(html, imgUrl); // 假设 getImgUrl 函数已经定义...Spider sp;spider_init(sp, imgUrl); // 初始化 Spider 对象spider_connect(sp);sprintf(header, GET %s HTTP/1.1\r\n, sp.resPath);sprintf(header strlen(header), Host:%s\r\n, sp.host);strcat(header, Connection:close\r\n);strcat(header, Content-Type: image/jpeg\r\n);strcat(header, \r\n);puts(header);if (SOCKET_ERROR send(spider-fd, header, strlen(header), 0)) {printf(send failed %d\n, WSAGetLastError());return;}//获取图片char recvBuf[1024] { 0 };len recv(sp.fd, recvBuf, 1023, 0);//查找有没有\r\n\r\nchar* psp strstr(recvBuf, \r\n\r\n);if (!psp)return;psp sizeof(\r\n\r\n);//接收图片数据FILE* fp fopen(hello.png, wb);fwrite(psp, sizeof(char), len - (psp - recvBuf), fp);fclose(fp);//继续接受没有接受完毕的while (1){len recv(sp.fd, recvBuf, 1023, 0);if (len 0){break;}else{fwrite(recvBuf, sizeof(char), len, fp);}}fclose(fp);printf(%s\n, imgUrl);
}
//获取网页中的图片链接
void getImgUrl(const char* html, char* imgUrl)
{if (!html || !imgUrl)return;char* beg strstr(html, img src\);if (!beg){return;}else{printf(\n\n\n\n\n\n\n\n\n\n\n\n\n\n);//puts(beg10);beg 10;}//找结尾的双引号char* end strstr(beg, \);if (!end){printf(网页错误\n);}else{strncpy(imgUrl, beg, end - beg);}
}//下载图片int main()
{printf(请输入要爬取的网址);char url[512] http://www.netbian.com;//gets_s(url, 128);puts(url);Spider sp;spider_init(sp, url);printf(Host:%s resPath:%s\n, sp.host, sp.resPath);getHtml(sp);getchar();return 0;
}