Monday, 27 October 2014

Bing Search Using Python

from bs4 import BeautifulSoup
import urllib2

class Bing:
    def __init__(self):
        # q is the search term; first is the index of the first result on the page
        self.__bing_url = "http://www.bing.com/search?q=%s&first=%s"

    def search(self, _s_search):
        i_page = 1
        urls = []
        while True:
            try:
                i_len_urls = len(urls)
                response = urllib2.urlopen(self.__bing_url % (_s_search, i_page))
                parsed_response = BeautifulSoup(response, "html.parser")
                # result links are anchors inside <h2> elements
                for h in parsed_response.findAll("h2"):
                    if h.a is not None:
                        s_url = h.a['href']
                        if s_url in urls:
                            continue
                        elif s_url.startswith("http://") or s_url.startswith("https://") or s_url.startswith("ftp://"):
                            urls.append(s_url)
                if len(urls) <= i_len_urls:  # no new results on this page, so stop
                    break
                i_page = i_page + 10  # the 'first' parameter advances by 10 results per page
            except:
                break  # stop instead of retrying forever on a network/parse error
        return urls

b = Bing()
urls = b.search("bing")
print len(urls)
print '\n'.join(urls)
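Note that the search term is substituted directly into the query URL; for search terms containing spaces or other special characters you would probably want to URL-encode the term first, for example with urllib.quote_plus. A small illustrative addition (not part of the original script):

import urllib
s_query = urllib.quote_plus("bing maps api")  # sample query, encodes to "bing+maps+api"
urls = Bing().search(s_query)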


Partial output of the above Python script is shown below:

194
http://www.bing.com/
http://en.wikipedia.org/wiki/Bing
http://www.facebook.com/Bing
https://twitter.com/Bing
http://www.bingtoolbar.com/en-US
http://www.youtube.com/user/bing
http://dictionary.reference.com/browse/bing
https://www.bingplaces.com/
http://www.thefreedictionary.com/Bing
http://43007555.r.msn.com/?ld=Dv_gnZ2ILAtTZR03P_3NWe4zVUCUyjVAbTHjJXdVp5pSJIfJyHoemQp9Uv_Wg6SZdsdEiXBfrZKs_SwYbqptHU6UyPdLC3XaylUED9ff_6c2EI5qNivG3i7FrVBH9_t3zJWhM63Utu8WncH9sAtJdZqlrMdY0&u=WindowFixed.com%2fone.php%3fremove%3dBing+Toolbar
http://3228102.r.msn.com/?ld=DvyB-NuRzghUJI5dayAAZZAzVUCUwcmn0K1FdQtdfmLCMQPUz3gc2BgUvJCHT8SlQgGQSrDGzPO89LJdfVC7Mg0Lnx22SO5JX7z0QT-CucKVea58oXWY5s-qrEpTLwMJKuUHNDA_udfLkpaKdohpE6EwelEPY&u=http%3a%2f%2fwww.ask.com%2fslp%3f%26q%3dwhat%2bis%2bbing%26sid%3d0c053e1c-66a9-4f01-ae7d-401fd0f4370f-0-us_msb%26kwid%3dbing%26cid%3d5787895127
http://advertise.bingads.microsoft.com/en-us/home
http://blogs.bing.com/webmaster/?p=8413
http://blogs.bing.com/
http://hk.bing.com/
http://advertise.bingads.microsoft.com/en-us/sign-up
http://en.wikipedia.org/wiki/Bing_(company)
http://www.microsoft.com/privacystatement/en-gb/bing/default.aspx
http://msdn.microsoft.com/en-us/library/dd877956.aspx
http://43007555.r.msn.com/?ld=d3odkhGgduicmifMUqm19BRTVUCUxkRtc4JCtSzSlX8koKvNTxhK6ZCc0xg7F2lL1VzUHlg0d091QCJzID_AFXhoaYV_qheV-DwL010iIeyGhkidZmY4BbDgkbsV4S7Y02EUygVci2nzFRJXxoML4rBHKB5GM&u=WindowFixed.com%2fone.php%3fremove%3dBing+Toolbar
https://addons.mozilla.org/en-US/firefox/addon/bing/
http://www.microsoft.com/maps/
https://www.bingmapsportal.com/
http://www.bingiton.com/
https://itunes.apple.com/us/app/bing/id345323231
http://www.merriam-webster.com/dictionary/bing
http://bing.en.softonic.com/

Sunday, 5 October 2014

Calculation of Beta of Stocks Using Python Libraries (Stock Risk Analysis)

As an example, let us consider Coca Cola (NYSE:KO). Historical Coca Cola stock data can be downloaded from Google Finance:
Historical NYSE:KO Data

Suppose we consider NYSE:SPY as the market indicator/index in calculating beta. Historical stock data of NYSE:SPY can be downloaded from Google Finance:
Historical NYSE:SPY Data

The following Python script computes the beta value; the historical data file names for the market index and the stock are passed as command-line parameters (market index file first, stock file second):

import numpy as np
from sklearn import linear_model
import sys

# read the market index historical data (e.g. SPY); rows are newest first, close price in column 5
fh = open(sys.argv[1], 'r')
lines = fh.readlines()
fh.close()
market_x = []
for i in range(len(lines) - 1):
    if i == 0:  # skip the header row
        continue
    line_i = lines[i].strip().split(',')[4]
    line_i_1 = lines[i + 1].strip().split(',')[4]
    rate = (float(line_i) - float(line_i_1)) / float(line_i_1)  # daily return
    market_x.append([rate])

# read the stock historical data (e.g. KO) in the same format
fh = open(sys.argv[2], 'r')
lines = fh.readlines()
fh.close()
stock_y = []
for i in range(len(lines) - 1):
    if i == 0:  # skip the header row
        continue
    line_i = lines[i].strip().split(',')[4]
    line_i_1 = lines[i + 1].strip().split(',')[4]
    rate = (float(line_i) - float(line_i_1)) / float(line_i_1)  # daily return
    stock_y.append([rate])

# beta is the slope of the regression of stock returns on market returns
regr = linear_model.LinearRegression()
regr.fit(market_x, stock_y)
print 'Beta: %s' % regr.coef_[0][0]
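As a rough sanity check, beta can also be computed directly from the two return series, since the regression slope equals Cov(stock returns, market returns) / Var(market returns). A minimal sketch, assuming the lines below are appended to the end of the script above (so that numpy, market_x and stock_y are already available):

m = np.array(market_x).flatten()  # market returns as a 1-D array
s = np.array(stock_y).flatten()   # stock returns as a 1-D array
beta = np.cov(s, m)[0][1] / np.var(m, ddof=1)  # Cov(stock, market) / Var(market)
print 'Beta (covariance method): %s' % beta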

Saturday, 30 August 2014

Stock Price/Volume Analysis Using Python and PyCluster

In this blog post we will be looking at how k-means (http://en.wikipedia.org/wiki/K-means_clustering) cluster analysis can be used to create clusters of (price, volume) data of stocks. 

The following Python script can be used to create the clusters. The input is the trading date, close price, and volume read from a comma-separated file, and the number of clusters is passed as a command-line argument at execution time. In this specific example, we cluster the data into 2, 3, 4, and 5 clusters. Also note that if a cluster contains fewer than a specified percentage of the points, we treat those points as possible outliers resulting from some extraordinary event related to that particular stock; this percentage largely depends on the number of data points and the number of clusters.

import numpy as np
import sys
import Pycluster
import matplotlib.pyplot as plt
from scipy import stats

fh = open(sys.argv[1], 'r')
lines = fh.readlines()
fh.close()
clusters = int(sys.argv[2])
max_points_pc = 5  # clusters holding at most this percentage of points are annotated as possible outliers

points_r = []
dates = []
volumes = []
close_prices = []
for i in range(len(lines)):
    if i <= 1:
        continue
    line_c = lines[i - 1].strip().split(',')  # skips the header row (and the last line) of the CSV
    close_price = float(line_c[4])
    volume = float(line_c[5])
    points_r.append((close_price, volume))
    volumes.append(volume)
    close_prices.append(close_price)
    dates.append(line_c[0])

# normalise price and volume to z-scores so both dimensions carry equal weight
volume_z = stats.zscore(np.array(volumes))
close_price_z = stats.zscore(np.array(close_prices))
points = zip(close_price_z, volume_z)

# k-means clustering on the normalised (price, volume) points
labels, error, nfound = Pycluster.kcluster(points, clusters)

# group the raw points and their dates by assigned cluster label
x = []
y = []
d = []
for i in range(clusters):
    x.append([])
    y.append([])
    d.append([])
for i in range(len(points_r)):
    index = labels[i]
    x[index].append(points_r[i][0])
    y[index].append(points_r[i][1])
    d[index].append(dates[i])

for i in range(clusters):
    plt.plot(x[i], y[i], 'o')
    # annotate only small clusters, which may correspond to extraordinary events
    if len(x[i]) <= max_points_pc * len(points) / 100:
        for j in range(len(x[i])):
            plt.annotate(d[i][j], (x[i][j], y[i][j]))
plt.xlabel('Close Price')
plt.ylabel('Volume')
plt.grid()
plt.show()
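Assuming the input CSV follows the Google Finance column layout (Date, Open, High, Low, Close, Volume), the script can be invoked along the following lines; the script and data file names here are only illustrative:

python price_volume_clusters.py ko.csv 3  # file names are illustrative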


The following figure shows the output of the above Python script with 2 clusters:

The following figure shows the output of the above Python script with 3 clusters:

The following figure shows the output of the above Python script with 4 clusters:

The following figure shows the output of the above Python script with 5 clusters:

Wednesday, 18 June 2014

Print HTTP Response Header: Python

When you make an HTTP request to retrieve a web page, you get an HTTP response back. This response includes the response body that we see in the browser and an HTTP header, which is usually not shown in the browser. This header can give useful insight about the web page, the web server, cookies, and so on.


The following simple Python script prints the HTTP response header for the URL passed as a command-line parameter:

import urllib2
import sys

def print_http_response_header(url):
    try:
        response = urllib2.urlopen(url)
        # response.info() holds the HTTP response header fields
        for key, value in response.info().items():
            print key + ' => ' + value
    except Exception as e:
        print 'Error retrieving %s: %s' % (url, e)

def main():
    print_http_response_header(sys.argv[1])

if __name__ == '__main__':
    main()


Output of the above script, with http://python.org as the command-line parameter, is shown below:

content-length => 45495
via => 1.1 varnish
x-cache => HIT
accept-ranges => bytes
strict-transport-security => max-age=63072000; includeSubDomains
vary => Cookie
server => nginx
connection => close
x-served-by => cache-sv95-SJC3
x-cache-hits => 19
date => Wed, 18 Jun 2014 16:56:04 GMT
x-frame-options => SAMEORIGIN
content-type => text/html; charset=utf-8
age => 863
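If only the header is of interest, the response body does not have to be downloaded at all: a HEAD request returns the header fields alone. A minimal sketch using Python 2's standard httplib module (the host and path below are illustrative):

import httplib
conn = httplib.HTTPConnection('www.python.org')  # illustrative host
conn.request('HEAD', '/')                        # HEAD: headers only, no body
response = conn.getresponse()
for key, value in response.getheaders():
    print key + ' => ' + value
conn.close()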

Monday, 5 May 2014

Compare HTML Pages: HTML Tags Counter

There are many instances in which we would want to compare HTML templates programmatically. One of the simplest factors that can be used to rule out the similarity of two HTML pages is to count the HTML tags in each page and compare the counts.

The following Python script helps to get the HTML tag count of an HTML webpage given a URL:

import lxml.html
import urllib2

def proc_root(root, tag_count):
    # count the current element's tag, then recurse into its children
    try:
        tag_count[root.tag] += 1
    except KeyError:
        tag_count[root.tag] = 1
    for child in root:
        proc_root(child, tag_count)
    return tag_count

def get_tag_count(url):
    tag_count = {}
    res = urllib2.urlopen(url).read()
    root = lxml.html.fromstring(res)
    proc_root(root, tag_count)
    return tag_count

def main():
    url = 'http://www.google.com'
    tag_count = get_tag_count(url)
    for tag, count in tag_count.items():
        print '%s\t%s' % (tag, count)

if __name__ == "__main__":
    main()





A sample output of the above script is given below:

meta 1
table 1
font 1
style 2
span 8
script 5
tr 1
html 1
input 7
td 3
body 1
head 1
form 1
nobr 2
br 6
a 20
b 1
center 1
textarea 1
title 1
p 1
u 1
div 17
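To compare two pages as described above, the two tag-count dictionaries can then be diffed tag by tag. A minimal sketch that builds on get_tag_count from the script above (the second URL is just an illustrative choice):

def compare_tag_counts(tag_count_1, tag_count_2):
    # sum of absolute per-tag count differences; 0 means identical tag profiles
    all_tags = set(tag_count_1.keys()) | set(tag_count_2.keys())
    return sum(abs(tag_count_1.get(t, 0) - tag_count_2.get(t, 0)) for t in all_tags)

tc_a = get_tag_count('http://www.google.com')
tc_b = get_tag_count('http://www.bing.com')  # illustrative second page
print 'Total tag count difference: %s' % compare_tag_counts(tc_a, tc_b)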

Sunday, 4 May 2014

An Extremely Simple and Effective Web Crawler in Python

Web crawlers, also known as web spiders, are used to retrieve web links/pages by following links starting from a seed/initial web page. Crawlers are widely used in building search engines, and the retrieved links/pages have numerous other applications.

Primary functions/operations of web link crawlers are:

1. Retrieve seed web page
2. Extract all valid URLs/links
3. Visit every link extracted in step 2. 
4. Stop when the maximum crawl depth has been reached.

The following Python script is an extremely simple and effective web crawler! It can be configured to use different seed URLs and different maximum depths.

import re, urllib
from urlparse import urlparse

depth_max = 1      # determines the maximum depth the crawler goes from the seed website
urls = []          # stores the unique urls retrieved while crawling
url_visited = []   # stores the urls of web pages visited

# clean url; add protocol and host if absent; remove query section
def getCleanURL(_cURL, _baseHost):
    try:
        oURL = urlparse(_cURL)
    except:
        return None
    if oURL.scheme == '':
        scheme = 'http'
    else:
        scheme = oURL.scheme
    if oURL.netloc == '':
        host = _baseHost
    else:
        host = oURL.netloc
    return scheme + '://' + host + oURL.path

def crawl(_baseURL, fh, _depth=0):
    baseHost = urlparse(_baseURL).netloc
    if _depth > depth_max:        # if depth exceeds the maximum depth, stop crawling further
        return
    elif _baseURL in url_visited: # web page already visited
        return
    try:
        res = urllib.urlopen(_baseURL).read()
        url_visited.append(_baseURL)
    except:                       # error visiting url/web page
        return
    res = res.replace('\n', '')
    # extract site-relative links (href="/...") from the page
    for url in re.findall('''href=["'](/[^"']+)["']''', res, re.I):
        url = getCleanURL(url, baseHost)
        if url is not None and url not in urls:  # check url validity and uniqueness
            urls.append(url)
            fh.write(url + '\n')
    for url in urls:
        crawl(url, fh, _depth + 1)

def main():
    seed_url = 'http://www.cnn.com'
    fh_urls = open('urls.txt', 'w')
    crawl(seed_url, fh_urls)
    fh_urls.close()

if __name__ == "__main__":
    main()





A partial output of the above script with http://www.cnn.com as the seed URL is given below:

http://www.cnn.com/tools/search/cnncom.xml
http://www.cnn.com/tools/search/cnncomvideo.xml
http://www.cnn.com/CNN/Programs
http://www.cnn.com/cnn/programs/
http://www.cnn.com/cnni/
http://www.cnn.com/video/
http://www.cnn.com/trends/
http://www.cnn.com/US/
http://www.cnn.com/WORLD/
http://www.cnn.com/POLITICS/
http://www.cnn.com/JUSTICE/
http://www.cnn.com/SHOWBIZ/
http://www.cnn.com/TECH/
http://www.cnn.com/HEALTH/
http://www.cnn.com/LIVING/
http://www.cnn.com/TRAVEL/
http://www.cnn.com/OPINION/
http://www.cnn.com/2014/05/04/us/circus-accident-rhode-island/index.html
http://www.cnn.com/2014/05/03/politics/washington-correspondents-dinner/index.html
http://www.cnn.com/2014/05/04/world/europe/ukraine-crisis/index.html
http://www.cnn.com/2014/05/04/us/clippers-shelly-sterling/index.html
http://www.cnn.com/2014/05/04/us/rocky-top-tennessee/index.html
http://www.cnn.com/2014/05/04/us/condoleeza-rice-rutgers-protest

Saturday, 19 April 2014

Typo Squatting: Beware of Typos in Domain Names/URLs

Typo Squatting (typosquatting) is a form of cybersquatting that is based on the errors users make when they enter a web address in a browser. It generally targets very popular websites that are accessed by a large number of users.

Typosquatters are the people who register such typo domains, generally with malicious intent. These typo websites may lead to parked domains, phishing websites, or malicious websites.

Typos are very common, and hence typosquatters register a large number of domain names that are typos of popular websites. When users make an error while trying to access a website, they may be led to a different website than the one they intended to visit, which can have unpredictable consequences. Hence, typosquatting is an important topic in web/cyber security research and industry. Also, from the internet user's point of view, it is vital to take maximum care in avoiding typos.

There are many reasons/actions that result in typos when entering a web address in the browser, or when entering any form of text using a regular keyboard. Among those, two common forms of typos are:

1. Character Omission Typo: This occurs when a user misses a character while entering a URL. For example, if the user enters http://www.bloger.com while intending to visit http://www.blogger.com, this results in a character omission typo.
2. Character Swap Typo: This occurs when a user accidentally swaps two adjacent characters in a web address. Considering a similar example, suppose the user enters http://www.bolgger.com while intending to visit http://www.blogger.com, this results in a character swap typo.




You may use the following Python script to analyze the type and number of typos that may result when trying to access a website. Note that the script takes the domain name as input; that is, if the user is trying to access http://www.blogger.com, the domain name is blogger.com.

from sets import Set

# generate typo domains where a single character is omitted
def add_missing_char_typo(domain_name):
    typo_domains = Set()
    domain_chars = list(domain_name)
    for i in range(len(domain_chars)):
        domain_chars_t = list(domain_chars)
        if domain_chars_t[i] == '.':  # stop once the TLD separator is reached
            break
        domain_chars_t[i] = ''
        typo_domain = ''.join(domain_chars_t)
        typo_domains.add(typo_domain)
    return typo_domains

# generate typo domains where two adjacent characters are swapped
def add_swap_char_typo(domain_name):
    typo_domains = Set()
    for i in range(len(domain_name) - 1):
        domain_chars_t = list(domain_name)
        c_i = domain_chars_t[i]
        c_i_1 = domain_chars_t[i + 1]
        if c_i == '.' or c_i_1 == '.':  # stop once the TLD separator is reached
            break
        elif c_i == c_i_1:  # swapping identical characters produces no typo
            continue
        domain_chars_t[i] = c_i_1
        domain_chars_t[i + 1] = c_i
        typo_domain = ''.join(domain_chars_t)
        typo_domains.add(typo_domain)
    return typo_domains

td1 = add_missing_char_typo('blogger.com')
td2 = add_swap_char_typo('blogger.com')
print 'Missed Character Typo: ' + str(td1)
print 'Swapped Character Typo: ' + str(td2)


Output of the above script is given below:

Missed Character Typo: Set(['bloggr.com', 'bloger.com', 'blogge.com', 'bogger.com', 'blgger.com', 'logger.com'])
Swapped Character Typo: Set(['blgoger.com', 'blogegr.com', 'bolgger.com', 'lbogger.com', 'bloggre.com'])
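
Since typosquatters tend to register exactly these variants, one rough follow-up is to check which of the generated typo domains currently resolve in DNS. A minimal sketch using the standard socket module, assuming td1 and td2 from the script above are available (results will of course vary over time):

import socket
for typo_domain in td1 | td2:  # union of the omission and swap typo sets
    try:
        print '%s resolves to %s' % (typo_domain, socket.gethostbyname(typo_domain))
    except socket.error:
        print '%s does not resolve' % typo_domain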