Examples: downloading movie reviews from Rotten Tomatoes (link); downloading reviews from a film critic's website (link); a word cloud plugin (link).
Flat File Structure. A file format: flat files contain tabular data in plain text format, with one data record per line and each record or line having one or more fields. These fields are separated by delimiters like commas, tabs, or colons.
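As a quick illustration, a minimal sketch of reading a flat file with pandas; the file name reviews.tsv and its tab delimiter are assumptions for the example:

import pandas as pd

# Hypothetical tab-separated flat file: one record per line, fields split by tabs
df = pd.read_csv('reviews.tsv', sep='\t')
df.head()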
import os

# Make directory if it doesn't already exist
folder_name = 'theFolderName'
if not os.path.exists(folder_name):
    os.makedirs(folder_name)
import requests

url = 'the_address'
response = requests.get(url)
A response status code of 200 means the request succeeded. The downloaded content now sits in response.
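A minimal sketch of that check:

# 200 means the request succeeded
if response.status_code == 200:
    print('Download OK')

Then write the content out to a file, naming it after the last piece of the URL: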
with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
    file.write(response.content)
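The same pattern scales to many files in a loop; a sketch assuming a hypothetical list of URLs named urls:

# urls is a hypothetical list of file URLs to download
for url in urls:
    response = requests.get(url)
    with open(os.path.join(folder_name, url.split('/')[-1]), mode='wb') as file:
        file.write(response.content)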
Check the folder's contents:
os.listdir(folder_name)
glob
Use glob to read batches of files such as .txt files. glob.glob takes a filename pattern and returns the matching paths inside a folder.
import glob
import pandas as pd

# List of dictionaries to build file by file and later convert to a DataFrame
df_list = []
for ebert_review in glob.glob('ebert_reviews/*.txt'):
    with open(ebert_review, encoding='utf-8') as file:  # check the source web page for the right encoding
        title = file.readline()[:-1]  # readline grabs one line; [:-1] drops the trailing newline character
        review_url = file.readline()[:-1]
        review_text = file.read()

    # Append to list of dictionaries
    df_list.append({'title': title,
                    'review_url': review_url,
                    'review_text': review_text})

df = pd.DataFrame(df_list, columns=['title', 'review_url', 'review_text'])  # finally convert to a DataFrame
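A quick sanity check on the assembled DataFrame:

df.head()   # first few parsed reviews
df.shape    # (number of files read, 3)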
from bs4 import BeautifulSoup

with open('fileName.html') as file:
    soup = BeautifulSoup(file, 'lxml')  # first we make a soup
    # the soup holds everything from the HTML file, uncleaned

# Then use find to locate the contents of a tag we want.
soup.find('title')  # returns the whole element with the title tag
soup.find('title').contents[0]  # returns the first piece of content inside the title tag
soup.find('title').contents[0][:-len(' the thing wanna delete')]  # trim the string
# List of dictionaries to build file by file and later convert to a DataFrame
df_list = []
folder = 'rt_html'
for movie_html in os.listdir(folder):
    with open(os.path.join(folder, movie_html)) as file:
        soup = BeautifulSoup(file, 'lxml')
        title = soup.find('title').contents[0][:-len(' - Rotten Tomatoes')]
        audience_score = soup.find('div', class_='audience-score meter').find('span').contents[0][:-1]
        num_audience_ratings = soup.find('div', class_='audience-info hidden-xs superPageFontColor')
        num_audience_ratings = num_audience_ratings.find_all('div')[1].contents[2].strip().replace(',', '')
        # Append to list of dictionaries
        df_list.append({'title': title,
                        'audience_score': int(audience_score),
                        'number_of_audience_ratings': int(num_audience_ratings)})

df = pd.DataFrame(df_list, columns=['title', 'audience_score', 'number_of_audience_ratings'])
import wptools
import requests
from PIL import Image
from io import BytesIO

# List of dictionaries to build and convert to a DataFrame later
df_list = []
image_errors = {}
for title in title_list:  # title_list is a pre-built list of strings
    try:
        # This cell is slow so print ranking to gauge time remaining
        ranking = title_list.index(title) + 1
        print(ranking)
        page = wptools.page(title, silent=True)
        images = page.get().data['image']
        # First image is usually the poster
        first_image_url = images[0]['url']
        r = requests.get(first_image_url)
        # Download movie poster image
        i = Image.open(BytesIO(r.content))
        image_file_format = first_image_url.split('.')[-1]
        i.save(folder_name + "/" + str(ranking) + "_" + title + '.' + image_file_format)
        # Append to list of dictionaries
        df_list.append({'ranking': int(ranking),
                        'title': title,
                        'poster_url': first_image_url})
    # Not best practice to catch all exceptions but fine for this short script
    except Exception as e:
        print(str(ranking) + "_" + title + ": " + str(e))
        image_errors[str(ranking) + "_" + title] = images

# The try block guards the loop; afterwards, review which images errored
for key in image_errors.keys():
    print(key)
from sqlalchemy import create_engine

# Create SQLAlchemy Engine and empty bestofrt database
# bestofrt.db will not show up in the Jupyter Notebook dashboard yet
engine = create_engine('sqlite:///bestofrt.db')

# Store cleaned master DataFrame ('df') in a table called master in bestofrt.db
# bestofrt.db will be visible now in the Jupyter Notebook dashboard
df.to_sql('master', engine, index=False)
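Reading the table back out of the database works the same way in reverse; a minimal sketch:

# Read the master table back into a DataFrame
df_from_db = pd.read_sql('SELECT * FROM master', engine)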
# [a-zA-Z] to signify emails in this dataset all start and end with letters
patients_clean['email'] = patients_clean.contact.str.extract(
    r'([a-zA-Z][a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+[a-zA-Z])', expand=True)
# Note: axis=1 denotes that we are referring to a column, not a row
patients_clean = patients_clean.drop('contact', axis=1)
# Mapping from full state name to abbreviation
state_abbrev = {'California': 'CA',
                'New York': 'NY',
                'Illinois': 'IL',
                'Florida': 'FL',
                'Nebraska': 'NE'}
# Function to apply
def abbreviate_state(patient):
    if patient['state'] in state_abbrev.keys():
        abbrev = state_abbrev[patient['state']]
        return abbrev
    else:
        return patient['state']
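Applying the function row by row then writes the abbreviations back; a sketch assuming patients_clean has a state column:

# axis=1 passes each row to abbreviate_state
patients_clean['state'] = patients_clean.apply(abbreviate_state, axis=1)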
# To category
patients_clean.assigned_sex = patients_clean.assigned_sex.astype('category')
patients_clean.state = patients_clean.state.astype('category')

# To datetime
patients_clean.birthdate = pd.to_datetime(patients_clean.birthdate)

# Strip u and to integer
treatments_clean.dose_start = treatments_clean.dose_start.str.strip('u').astype(int)
treatments_clean.dose_end = treatments_clean.dose_end.str.strip('u').astype(int)
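To confirm the conversions took effect, inspect the dtypes:

patients_clean.dtypes     # assigned_sex and state should now be category; birthdate datetime64[ns]
treatments_clean.dtypes   # dose_start and dose_end should now be int64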
Use str.pad to pad strings.
# Strip all " ", "-", "(", ")", and "+" and store each number without any formatting.
# Pad the phone number with a 1 if the number is 10 digits long (we want the country code).
patients_clean.phone_number = patients_clean.phone_number.str.replace(r'\D+', '', regex=True).str.pad(11, fillchar='1')
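A tiny check of how str.pad behaves (the sample number is made up); it pads on the left by default:

pd.Series(['2345678901']).str.pad(11, fillchar='1')   # -> '12345678901'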
# Patient ID should be the only duplicate column
all_columns = pd.Series(list(patients_clean) + list(treatments_clean))  # patients_clean and treatments_clean are DataFrames
all_columns[all_columns.duplicated()]