Harvard Universitesinin edX platformu uzerindeki "Using Python for Research" adli dersinden yararlanilmistir.
link: https://courses.edx.org/courses/course-v1:HarvardX+PH526x+3T2016/course/
# Method vs. attribute access on a NumPy array, followed by two scalar helpers from math.
import numpy as np # matrix and vector operations
x = np.array([1,3,5])
x.mean() # mean is a method, so it is called with ()
x.shape # shape is an attribute, so there is no ()
import math
math.pi # the constant pi
math.cos(math.pi)
math.sqrt vektorler ile calismaz
# Compare the scalar-only math.sqrt with the element-wise np.sqrt.
math.sqrt
np.sqrt

# Try each square-root function on a list and report whether it accepts one.
# Only TypeError is caught: that is what a scalar-only function raises for a
# list argument, and any other error should surface instead of being hidden.
# After the loop `sq` is bound to math.sqrt, exactly as in the original cell.
for sq in (np.sqrt, math.sqrt):
    try:
        print(sq([16, 9, 4]))
    except TypeError:
        print(sq, "vektorler ile calismaz")
    else:
        print(sq, "vektorler ile calisir")
# The built-in type() reports the class of a value.
type("Uzay")
type(5)
type(True)
True or False
import random
# random.choice picks one element of the sequence; repeated calls may differ.
random.choice([3,"A", 5, "B"])
random.choice([3,"A", 5, "B"])
# 9x2 matrix of random integers; randint's upper bound (high) is exclusive.
A = np.random.randint(low = 20, high=30, size = (9,2))
A
B = A - 20 # element-wise subtraction
B
B.sum(axis=0) # sums down the rows: one total per column
B.sum(axis=1) # sums across the columns: one total per row
Tuple, veri paketlemek icin kullanilir. Ozellikle bir fonksiyon birden fazla deger dondurecekse cok faydali olur: veriler paketlenip tek seferde gonderilir.
# Pack two values into a tuple, then unpack the tuple back into two names.
x, y = 5, 7
koordinat = (x, y)
koordinat
a, b = koordinat # tuple unpacking
print(a, "->", b)
def topla_cikar(a,b):
    """Return the sum and the difference of ``a`` and ``b`` packed in a tuple.

    The result is ``(a + b, a - b)``, so the caller can unpack both values
    from a single call.
    """
    return (a + b, a - b)
# Unpack the returned tuple straight into two names and print both.
toplam, fark = topla_cikar(4,6)
print("toplam : {} fark : {}".format(toplam, fark))
def password(uzunluk):
    """Generate a random password.

    Characters are drawn uniformly from the lowercase ASCII letters and the
    decimal digits. The ``secrets`` module is used instead of ``random``
    because ``random`` is a deterministic generator that is not meant for
    security-sensitive values such as passwords.

    Args:
        uzunluk: Number of characters to generate. Zero or a negative value
            yields an empty string.

    Returns:
        The generated password as a ``str`` of length ``uzunluk``.
    """
    import secrets  # local import: keeps this block self-contained

    ch = "abcdefghijklmnopqrstuvyxwz" + "0123456789"
    # join() builds the result in one pass instead of repeated concatenation.
    return "".join(secrets.choice(ch) for _ in range(uzunluk))
password(6)
# np.linspace(start, stop, num): num evenly spaced samples between start and stop.
np.linspace(10,20,5)
x = np.linspace(-100,100,50)
y = x**2
y
import matplotlib.pyplot as plt
plt.plot(x,y)
plt.show()
# The same parabola on a narrower, finer grid, drawn together with its
# axis-swapped version.
x = np.linspace(-10,10,100)
y = x**2
plt.plot(y,x, label = "$x^{1/2}$") # axes swapped: x plotted against x**2
plt.plot(x,y, label = "$x^2$")
plt.grid()
plt.legend()
plt.savefig("saved.pdf") # the figure is written to disk before show()
plt.show()
random.choice(["Yazi","Tura"]) # a single coin toss
# 1000 coin tosses encoded as 0/1, shown as a histogram.
atis = [random.choice([0,1]) for i in range(1000)]
plt.hist(atis)
plt.show()
zar = list(range(1,7)) # the faces of a six-sided die: 1..6
zar
# 1000 die rolls; note that atis is rebound here.
atis = [random.choice(zar) for i in range(1000)]
plt.hist(atis)
plt.show()
# 2 x 10 matrix of random integer steps; row 0 is plotted as x and row 1 as y below.
A = np.random.randint(low = -1, high=3, size = (2,10))
A
A = A.cumsum(axis=1) # running total along each row turns steps into positions
A
orijin = np.array([[0],[0]])
A = np.hstack((orijin, A)) # prepend the origin column so the path starts at (0, 0)
A
plt.plot(A[0],A[1])
plt.scatter(orijin[0],orijin[1]) # mark the starting point
plt.show()
def rassalYuruyus(geri = -1, ileri = 3, sayi = 10000, renk = "b"):
    """
    Plot a two-dimensional random walk of ``sayi`` steps.

    What happens if we take random backward and forward steps for
    ``sayi`` (10000 by default) moves?

    geri, ileri -- passed to np.random.randint as low / high for every step
    sayi        -- number of steps drawn for each of the two coordinates
    renk        -- matplotlib format string used for the path
    """
    baslangic = np.array([[0], [0]])
    adimlar = np.random.randint(low=geri, high=ileri, size=(2, sayi))
    # Running totals along each row are the positions; the origin column is
    # prepended so that the path starts at (0, 0).
    yol = np.hstack((baslangic, adimlar.cumsum(axis=1)))
    plt.plot(yol[0], yol[1], renk)
    plt.scatter(baslangic[0], baslangic[1])
    plt.show()
# Defaults (geri=-1, ileri=3): the forward bound is larger than the backward
# one, so the walk generally tends to drift forward.
rassalYuruyus()
# Narrow the gap between the backward and the forward bound.
rassalYuruyus(geri = -2, ileri = 3, renk = "r")
# Same backward bound with a smaller forward bound.
# NOTE(review): the bounds go to np.random.randint as low/high and high is
# exclusive, so this call draws steps from -2..1 -- confirm that is intended.
rassalYuruyus(geri = -2, ileri = 2, renk = "r")
# Wider step range: bounds -x and x + 1.
x = 6
rassalYuruyus(geri = -x, ileri = x+ 1, renk = "r")
# From DNA (origin) to Amino Acids (translation)
# `origin` holds the nucleotide sequence and `translation` the reference
# amino-acid sequence; both are pasted as multi-line text.
origin = """GGTCAGAAAAAGCCCTCTCCATGTCTACTCACGATACATCCCTGAAAACCACTGAGGAAGTGGCTTTTCA
GATCATCTTGCTTTGCCAGTTTGGGGTTGGGACTTTTGCCAATGTATTTCTCTTTGTCTATAATTTCTCT
CCAATCTCGACTGGTTCTAAACAGAGGCCCAGACAAGTGATTTTAAGACACATGGCTGTGGCCAATGCCT
TAACTCTCTTCCTCACTATATTTCCAAACAACATGATGACTTTTGCTCCAATTATTCCTCAAACTGACCT
CAAATGTAAATTAGAATTCTTCACTCGCCTCGTGGCAAGAAGCACAAACTTGTGTTCAACTTGTGTTCTG
AGTATCCATCAGTTTGTCACACTTGTTCCTGTTAATTCAGGTAAAGGAATACTCAGAGCAAGTGTCACAA
ACATGGCAAGTTATTCTTGTTACAGTTGTTGGTTCTTCAGTGTCTTAAATAACATCTACATTCCAATTAA
GGTCACTGGTCCACAGTTAACAGACAATAACAATAACTCTAAAAGCAAGTTGTTCTGTTCCACTTCTGAT
TTCAGTGTAGGCATTGTCTTCTTGAGGTTTGCCCATGATGCCACATTCATGAGCATCATGGTCTGGACCA
GTGTCTCCATGGTACTTCTCCTCCATAGACATTGTCAGAGAATGCAGTACATATTCACTCTCAATCAGGA
CCCCAGGGGCCAAGCAGAGACCACAGCAACCCATACTATCCTGATGCTGGTAGTCACATTTGTTGGCTTT
TATCTTCTAAGTCTTATTTGTATCATCTTTTACACCTATTTTATATATTCTCATCATTCCCTGAGGCATT
GCAATGACATTTTGGTTTCGGGTTTCCCTACAATTTCTCCTTTACTGTTGACCTTCAGAGACCCTAAGGG
TCCTTGTTCTGTGTTCTTCAACTGTTGAAAGCCAGAGTCACTAAAAATGCCAAACACAGAAGACAGCTTT
GCTAATACCATTAAATACTTTATTCCATAAATATGTTTTTAAAAGCTTGTATGAACAAGGTATGGTGCTC
ACTGCTATACTTATAAAAGAGTAAGGTTATAATCACTTGTTGATATGAAAAGATTTCTGGTTGGAATCTG
ATTGAAACAGTGAGTTATTCACCACCCTCCATTCTCT"""
translation="""MSTHDTSLKTTEEVAFQIILLCQFGVGTFANVFLFVYNFSPIST
GSKQRPRQVILRHMAVANALTLFLTIFPNNMMTFAPIIPQTDLKCKLEFFTRLVARST
NLCSTCVLSIHQFVTLVPVNSGKGILRASVTNMASYSCYSCWFFSVLNNIYIPIKVTG
PQLTDNNNNSKSKLFCSTSDFSVGIVFLRFAHDATFMSIMVWTSVSMVLLLHRHCQRM
QYIFTLNQDPRGQAETTATHTILMLVVTFVGFYLLSLICIIFYTYFIYSHHSLRHCND
ILVSGFPTISPLLLTFRDPKGPCSVFFNC"""
translation[:4] # peek at the first four amino acids
# there are some invisible "\n" line breaks, get rid of them
origin = origin.replace("\n","")
origin = origin.replace("\r","")
# Keep characters 20..937 only (918 characters, i.e. a whole number of triplets).
# NOTE(review): presumably the coding region given in the sequence record --
# confirm the coordinates against the source annotation.
origin = origin[20:938]
translation = translation.replace("\n","")
translation = translation.replace("\r","")
# Dictionary (lookup table) from DNA to AminoAcids
# Keys are three-letter DNA triplets, values are one-letter amino-acid codes;
# three triplets (TAA, TAG, TGA) map to "_".
table = {
'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
'TAC':'Y', 'TAT':'Y', 'TAA':'_', 'TAG':'_',
'TGC':'C', 'TGT':'C', 'TGA':'_', 'TGG':'W',
}
table["ATA"] # single lookup example
# Look up the very first triplet as a sanity check.
k = 0
triple = origin[k:(k+3)]
print(triple, "->", table[triple])

# Translate the whole sequence triplet by triplet; leftover characters that
# do not fill a complete triplet are ignored.
L = len(origin) // 3
amino_acids = []
for i in range(L):
    k = 3 * i
    triple = origin[k:k + 3]
    amino_acids.append(table[triple])
mytranslation = "".join(amino_acids)
mytranslation
# Compare against the reference translation, leaving out the last character
# of our own result (presumably the "_" stop marker -- confirm).
translation == mytranslation[:-1]
Cekilis sonuclarini yukaridakine benzer bir bicimde sorgulayan bir sistem yapabilir misiniz?
Sorgu sistemi: http://www.millipiyango.gov.tr (Tum listeyi tiklayin)
Gutenberg Projesi kapsaminda Ingilizce ve Fransizca bazi kitaplari inceleyecegiz. Kitaplari asagidaki linkten indirebilirsiniz.
Ama once biraz alistirma yapalim.
pierces_lyrics = """We are stars,
Fashioned in the flesh and bone,
We are islands,
Excuses to remain alone,
We are moons,
Throw ourselves around each other,
We are oceans,
Being controlled by the pull of another.
"""
text = pierces_lyrics  # untouched copy, reused by later cells
pierces_lyrics

# Normalise the text: commas and line breaks become spaces, then everything
# is lower-cased.
for ayrac in (",", "\n"):
    pierces_lyrics = pierces_lyrics.replace(ayrac, " ")
pierces_lyrics
pierces_lyrics = pierces_lyrics.lower()
pierces_lyrics
pierces_lyrics.split(" ")

# Count the occurrences of every non-empty token.
lyrics = {}
for word in pierces_lyrics.split(" "):
    if word:
        lyrics[word] = lyrics.get(word, 0) + 1
lyrics
def getDictionary(text):
    """Count how often each word occurs in ``text``.

    The text is lower-cased, the characters ``. , ; : ? ' "`` and newlines are
    replaced by spaces, and the result is split on single spaces. Empty tokens
    are skipped. Returns a plain ``dict`` mapping word -> count, in order of
    first appearance.
    """
    # One translation table turns every skipped character into a space.
    skip_map = str.maketrans({ch: " " for ch in ".,;:?'\"\n"})
    cleaned = text.lower().translate(skip_map)
    mydict = {}
    for word in cleaned.split(" "):
        if word:
            mydict[word] = mydict.get(word, 0) + 1
    return mydict
getDictionary(text)
getDictionary(text).values() # only the counts
# NOTE(review): `lyrics` was built after replacing only "," and "\n", while
# getDictionary also replaces "." (and other punctuation), so the last word is
# keyed "another." in one and "another" in the other -- this comparison is
# therefore expected to be False.
lyrics == getDictionary(text)
def read_book(path):
    """
    Read a book and return it as a single-line string.

    Every line break is replaced by one space. Removing the breaks outright
    would glue the last word of a line to the first word of the next line,
    which corrupts word counts and breaks searches for phrases that happen
    to wrap across lines.

    path -- location of a UTF-8 encoded text file
    Returns the file content as ``str`` without "\\n" or "\\r" characters.
    Raises whatever ``open`` / ``read`` raise (e.g. FileNotFoundError,
    UnicodeDecodeError).
    """
    with open(path, "r", encoding="utf8") as f:
        text = f.read()
    # "\r\n" is handled first so that a Windows line ending yields one space.
    for line_break in ("\r\n", "\n", "\r"):
        text = text.replace(line_break, " ")
    return text
# Load one book and locate a well-known quote in it.
title_path = "Books_EngFr/English/shakespeare/Romeo and Juliet.txt"
book = read_book(title_path)
len(book) # number of characters in the returned string
ind = book.find("What's in a name?") # index of the first match; str.find gives -1 if absent
ind
quote = book[ind:(ind+100)] # up to 100 characters starting at the match
quote
def word_stats(word_counts):
    """Summarise a word -> count mapping.

    Returns a tuple ``(num_unique, counts)``: the number of distinct words,
    and the mapping's ``values()`` view holding the per-word frequencies.
    """
    return (len(word_counts), word_counts.values())
# Word statistics for a single book.
title_path = "Books_EngFr/English/shakespeare/Romeo and Juliet.txt"
book = read_book(title_path)
word_counts = getDictionary(book)
(num_unique, counts) = word_stats(word_counts)
num_unique # number of distinct words
sum(counts) # total number of counted words
import os
# Walk the language/author/title directory tree and print, for every book,
# its path followed by its number of unique words. Entries whose name starts
# with "." are treated as hidden files and skipped.
book_dir = "Books_EngFr"
for language in os.listdir(book_dir):
    if language.startswith("."):
        continue
    language_dir = book_dir + "/" + language
    for author in os.listdir(language_dir):
        if author.startswith("."):
            continue
        author_dir = language_dir + "/" + author
        for title in os.listdir(author_dir):
            if title.startswith("."):
                continue
            inputfile = author_dir + "/" + title
            print(inputfile)
            book = read_book(inputfile)
            (num_unique, counts) = word_stats(getDictionary(book))
            print(num_unique)
import pandas as pd
# Minimal DataFrame example: start with named columns, then add labelled rows.
# NOTE: this rebinds `table`, which earlier in the file held the codon lookup dict.
table = pd.DataFrame(columns = ("name", "age"))
table.loc["1st"] = "James", 22 # assigning to a new label appends a row
table.loc["2nd"] = "Jess", 23
table
import os
# Build one row of statistics per book (language, author, title, total word
# count, unique word count). The rows are collected in a list and the
# DataFrame is created once at the end: growing a DataFrame row by row with
# .loc re-allocates it on every insertion, and building it in one go also
# lets pandas infer the column dtypes from the complete data.
book_dir = "Books_EngFr"
rows = []
for language in os.listdir(book_dir):
    if language.startswith("."): continue # hidden file
    for author in os.listdir(book_dir + "/" + language):
        if author.startswith("."): continue # hidden file
        for title in os.listdir(book_dir + "/" + language + "/" + author):
            if title.startswith("."): continue # hidden file
            inputfile = book_dir + "/" + language + "/" + author + "/" + title
            book = read_book(inputfile)
            (num_unique, counts) = word_stats(getDictionary(book))
            rows.append((language, author, title, sum(counts), num_unique))
# Rows keep their 1-based labels; title_num ends as the next free label,
# exactly as it did when it was incremented inside the loop.
title_num = len(rows) + 1
stats = pd.DataFrame(
    rows,
    columns = ["language", "author", "title", "length", "unique"],
    index = range(1, title_num),
)
stats
# add a new column: unique words per word of text
stats["ratio"] = stats["unique"] / stats["length"]
stats.sort_values("ratio") # sort by ratio; the result is only displayed, not assigned
import matplotlib.pyplot as plt
# Book length against number of unique words, on linear axes.
plt.plot(stats.length, stats.unique, "bo")
plt.show()
# The same relationship on log-log axes, one colour per language.
plt.figure(figsize=(10,10))
subset = stats[stats.language == "English"]
plt.loglog(subset.length, subset.unique, "bo", label = "English")
subset = stats[stats.language == "French"]
plt.loglog(subset.length, subset.unique, "ro", label = "French")
plt.xlabel("Length of Book")
plt.ylabel("Number of Unique Words")
plt.legend()
plt.show()