您正在浏览:主页 > 游戏新闻 > python爬虫获取基金数据
作者:雷霆之怒公益服 来源:http://www.edmi.com.cn 时间:2020-09-20 19:35
Codes=fund_code.trade_code 同样通过python爬虫得到:
3,000004,2013-03-20,江小震,债券型,中海基金,0.19亿元 (截止至:03-31)
rownum += 1
step = 10
def beginSpider():
# 获取html内容
items = items.encode('utf-8')
try:
jumpbtn = driver.find_element_by_id("pagebar").find_element_by_xpath(
print (Code)
driver.get(fund_url) # 要抓取的网页地址
# 返回
很明显URL中的519961是基金代码,那只要有所有的基金代码就可以获得所有基金历史数据了,
rownum += 1
fund_code =pd.read_table(filepath_or_buffer=file_path,sep=',')
t.start()
table = driver.find_element_by_xpath("//table[@class='w782 comm lsjz']/tbody")
# 锁住
pass
import os
items = jjcode +','+colum0 +','+ colum1 +','+ colum2 + ','+ colum3+ '\r\n'
lock.acquire()
"div[@class='pagebtns']/input[@class='pnum']") # 得到 页码文本框
import os
total_page = 20
000001,2018-03-01,1.1310,3.5420,0.89
000001,2018-02-08,1.0760,3.4870,0.94
colum3 = colum3.replace("管理人:","")
for row in tables.find_elements_by_xpath(".//li"):
000001,2018-02-22,1.1220,3.5330,2.56
col = row.find_elements(By.TAG_NAME, "a")
except:
files.write(items)
colum3 = colum3.replace("%","")
11,000014,2013-03-19,何家琪,债券型,华夏基金,1.29亿元 (截止至:03-31)
13,000016,2013-03-08,柳万军,债券型,华夏基金,4.36亿元 (截止至:03-31)
000001,2018-02-28,1.1210,3.5320,-0.53
colum2 = colum2.replace("类型:","")
# 创建锁
from threading import Thread,Lock
用sklearn分析基金数据<1>
import time
7292,750007,安信现金管理货币B
得到的数据如下,共7294个:
from selenium.webdriver.common.by import By
with open('Leixingall.txt','ab') as files:
driver = webdriver.PhantomJS(executable_path=r"C:\Program Files\phantomjs-2.1.1-windows\bin\phantomjs.exe")
fund_code =pd.read_table(filepath_or_buffer=file_path,sep=',')
lock = Lock()
JJCD = str(rownum) +','+Code+','+colum0+','+colum1+','+colum2+','+colum3+','+colum4+ '\r\n'
for Code in Codes:
file_path=os.path.join(os.getcwd(),'fund_rest.txt')
"div[@class='pagebtns']/label[text()='下一页']/preceding-sibling::label[1]").get_attribute("innerHTML")
try:
12,000015,2013-03-08,柳万军,债券型,华夏基金,13.54亿元 (截止至:03-31)
driver.get("http://fund.eastmoney.com/allfund.html") # 要抓取的网页地址
JJCD = JJCD.encode('utf-8')
except:
colum2 = colum1
total_page = initSpider(fund_url,driver)
driver.get(fund_url) # 要抓取的网页地址
colum3 = col[3].text #比率
# 下面是利用 selenium 抓取html页面的代码
jjcode = code
col = row.find_elements(By.TAG_NAME, "td")
for row in tables.find_elements_by_xpath("p"):
for x in myrange:
print (Code)
JJCD = str(rownum) +','+colum0+ '\r\n'
000001,2018-02-26,1.1370,3.5480,1.25
for Code in Codes:
lock.release()
tables = driver.find_element_by_id("code_content")
在我的上一篇文章中也用到过python获取数据只是没用bs4这类库,光用re比较费神,有时出不来有时出来不需要的内容,
000001,2018-02-27,1.1270,3.5380,-0.88
000001,2018-02-14,1.0940,3.5050,0.64
tables = driver.find_element_by_class_name("bs_gl")
用sklearn训练样本数据<4>
6,000008,2013-03-22,何如 陈正宪,联接基金,嘉实基金,8.22亿元 (截止至:03-31)
数据预处理:数据清洗、生成样本数据<3>
colum0 = colum0.replace("(","")
WebDriverWait(driver, 20).until(lambda driver: driver.find_element_by_id("pagebar").find_element_by_xpath("div[@class='pagebtns']/label[@value={0} and @class='cur']".format(x)) != None)
7293,762001,国金国鑫发起
Codes=fund_code.trade_code
Code = str(Code).zfill(6)
7286,740602,长安货币B
本次基金数据的来源为天天基金网,其中抓取某只基金历史净值数据参考了:
用模型进行预测及改进<5>
colum2 = col[2].text
from selenium import webdriver
print(colum0)
000001,2018-02-13,1.0870,3.4980,0.74
pass
7285,740601,长安货币A
for t in thread_list:
getPage_text = driver.find_element_by_id("pagebar").find_element_by_xpath(
colum4 = colum4.replace(",","")
from selenium.webdriver.common.by import By
colum1 = col[1].text
tonum.clear() # 第x页 输入框
return total_page
有了所有的基金代码,就可以再循环抓取每只基金的历史净值数据了:
def initSpider(fund_url,driver):
# 得到总共有多少页
python爬虫获取基金数据<2>
# 解锁
jumpbtn.click() # 点击按钮
这次用了bs4和webdriver明显方便多了。
seq,fund_code,esta_dt,fund_manager,fund_type,fund_admin,fund_scale
10,000013,2013-03-04,石大怿 刘朝阳,货币型,易方达基金,28.11亿元 (截止至:03-31)
except:
fund_url='http://fund.eastmoney.com/f10/jjjz_'+Code +'.html'
JJCD = JJCD.encode('utf-8')
000001,2018-03-06,1.1280,3.5390,0.98
colum0 = colum0.replace(")",",")
# 初始化爬虫
import pandas as pd
files.write(JJCD)
7290,750005,安信平稳增长混合发起A
数据如下:
from selenium import webdriver
Code = str(Code).zfill(6)
代码如下:
from selenium.webdriver.support.ui import WebDriverWait
import time
if total_page >20:
1,000001,2001-12-18,董阳阳,混合型,华夏基金,47.68亿元 (截止至:03-31)
colum0 = col[0].text
前面已经说了大致思路,需要用上个月所有基金的表现情况来预测这个月所有基金的表现情况,数据的获取显然是第一步,像基金这种开放的数据通过python爬虫来得到自然是容易的。
# 初始化函数
col = row.find_elements(By.TAG_NAME, "label")
for r in range_list:
9,000011,2004-08-11,陈伟彦,混合型,华夏基金,36.06亿元 (截止至:03-31)
colum4 = colum4.replace("资产规模: ","")
try:
000001,2018-02-23,1.1230,3.5340,0.09
driver = webdriver.PhantomJS(executable_path=r"C:\Program Files\phantomjs-2.1.1-windows\bin\phantomjs.exe")
pass
print('抓取完成 %s'%Code)
total_page = int("".join(filter(str.isdigit, getPage_text)))
8,000010,2013-03-04,石大怿 刘朝阳,货币型,易方达基金,256.30亿元 (截止至:03-31)
colum2 = col[2].text #总价格
colum0 = col[0].text #日期
7287,750001,安信灵活配置混合
000001,2018-02-12,1.0790,3.4900,3.15
rownum = 1
with open('allitems.txt','ab') as files:
from selenium.webdriver.common.by import By
所以首先要得到所有基金的代码数据,目标URL同样来自天天基金网:
files.write(JJCD)
colum1 = colum1.replace("基金经理: ","")
7288,750002,安信目标收益债券A
def getData(myrange,driver,lock,code):
colum0 = col[0].text
000001,2018-03-02,1.1210,3.5320,-0.88
colum0 = colum0.replace("成立日期:","")
colum1 = colum1.replace("*","")
with open('JJCD.txt','ab') as files:
# 抓取
000001,2018-03-05,1.1170,3.5280,-0.36
rownum = 1
fund_url='http://fund.eastmoney.com/f10/jjjz_'+Code +'.html'
可以通过total_page来控制数据的日期范围,得到的数据如下:
7284,740101,长安沪深300非周期
7289,750003,安信目标收益债券C
driver = webdriver.PhantomJS(executable_path=r"C:\Program Files\phantomjs-2.1.1-windows\bin\phantomjs.exe")
for row in table.find_elements_by_xpath(".//tr"):
7294,770001,德邦优化混合
import pandas as pd
2,000003,2013-03-20,江小震,债券型,中海基金,0.29亿元 (截止至:03-31)
thread_list = []
tonum.send_keys(str(x)) # 去第x页
r = range(1, int(total_page)+1)
time.sleep(0.1)
https://blog.csdn.net/github_26672553/article/details/78662563
"div[@class='pagebtns']/input[@class='pgo']") # 跳转到按钮
beginSpider()
def beginSpider():
000001,2018-02-09,1.0460,3.4570,-2.79
其中目标URL为
time.sleep(1)
colum3 = col[3].text
7,000009,2013-03-04,石大怿 刘朝阳,货币型,易方达基金,422.44亿元 (截止至:03-31)
t.join() # 这一步是需要的,等待线程全部执行完成
4,000005,2013-03-08,刘宁,定开债券,嘉实基金,1.37亿元 (截止至:06-22)
5,000007,2013-03-08,刘涛,债券型,鹏华基金,0.40亿元 (截止至:03-31)
7291,750006,安信现金管理货币A
file_path=os.path.join(os.getcwd(),'JJCD.txt')
到此就已经获得了所有基金的历史净值数据了,但是还需要基金的基本信息,后续会用到,比如基金的类型,是股票型还是债券型。基金的基本信息在天天基金上也有:
000001,2018-02-07,1.0660,3.4770,-2.83
<<上一篇:华夏平稳增长基金:弱市中运用TIPP保险策略 >>
<<下一篇:中国第6架国产C919原型机完成首飞 >>