FaaS (aka serverless, aka AWS Lambda)
We will explore OpenFaaS and cover some of the OpenFaaS tutorial.
# Log into 4 lab systems, then into their VMs
# on the master ...
docker swarm init --advertise-addr 10.11.12.XX
# keep your token handy for the workers
# docker swarm join --token SWMTKN-1-4l54gykosxar3km60sirxgc4mgctaufuzzgbvwfig5rjxl1ff4-697ylemets05kudary7sdhign 10.11.12.201:2377
# on workers...
docker swarm leave --force
# then join using the docker swarm join command printed by swarm init
# on the master ...
curl -sL cli.openfaas.com | sudo sh # to install the OpenFaaS CLI
git clone https://github.com/openfaas/faas
cd faas && git checkout master
./deploy_stack.sh --no-auth
docker service ls
# on the desktop, visit http://localhost:8080 (the OpenFaaS gateway UI)
mkdir ~/faasTutorial && cd ~/faasTutorial
# Lab 2
mkdir -p lab2 && cd lab2
faas-cli deploy -f https://raw.githubusercontent.com/openfaas/faas/master/stack.yml
# check the browser
# Play with some of the functions
# Deploy the figlet function from the store
faas-cli list
faas-cli list --verbose
faas-cli invoke markdown # invoke a function from the command line; reads the request body from stdin (Ctrl-D to send)
echo Hi | faas-cli invoke markdown
uname -a | faas-cli invoke markdown
# Monitoring via https://grafana.com/
docker service create -d \
--name=grafana \
--publish=3000:3000 \
--network=func_functions \
stefanprodan/faas-grafana:4.6.3
# visit http://127.0.0.1:3000/dashboard/db/openfaas
# login with admin/admin
# Lab 3
cd ~/faasTutorial && mkdir -p lab3 && cd lab3
faas-cli template pull
faas-cli new --list # list available templates
faas-cli new --lang python3 hello-openfaas --prefix=""
# Hello World!!
# explore the hello-openfaas directory
# modify handler.py so that it returns "Hello OpenFaas"
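# for reference, a minimal sketch of the modified handler.py (assuming the stock
# python3 template, which generates a single handle(req) function):
-----------------------------------------------------------------
def handle(req):
    # req holds the request body passed in by the gateway; it is ignored here
    return "Hello OpenFaas"
-----------------------------------------------------------------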
faas-cli up --help
faas-cli up -f hello-openfaas.yml # build, push, and deploy
faas-cli build -f ./hello-openfaas.yml
faas-cli deploy -f ./hello-openfaas.yml
faas-cli invoke hello-openfaas
# function has a route...
# http://127.0.0.1:8080/function/hello-openfaas
# Astronaut Finder WON'T WORK IN THE LAB!!!
faas-cli new --lang python3 astronaut-finder --prefix=""
echo 'requests' >> ./astronaut-finder/requirements.txt
# Change handler.py so that it is ...
-----------------------------------------------------------------
import requests
import random

def handle(req):
    # fetch the current list of people in space and pick one at random
    r = requests.get("http://api.open-notify.org/astros.json")
    result = r.json()
    index = random.randint(0, len(result["people"]) - 1)
    name = result["people"][index]["name"]
    return "%s is in space" % (name)
-----------------------------------------------------------------
faas-cli build -f ./astronaut-finder.yml
faas-cli deploy -f ./astronaut-finder.yml
echo | faas-cli invoke astronaut-finder
# Lab 4
cd ~/faasTutorial && mkdir -p lab4 && cd lab4
faas-cli new --lang python3 base64hello --prefix=""
echo 'requests' >> ./base64hello/requirements.txt
# make handler.py ...
---------------------------------------------------------------------------------------
import os
import requests
import sys

def handle(req):
    """
    gateway_hostname = os.getenv("gateway_hostname", "gateway")  # uses a default of "gateway" when "gateway_hostname" is not set
    test_sentence = req
    """
    arg = req
    # r = requests.get("http://10.11.12.201:8080/function/hello-openfaas", data=arg)
    # call the hello-openfaas function through the gateway service on the swarm network
    r = requests.get("http://gateway:8080/function/hello-openfaas", data=arg)
    if r.status_code != 200:
        sys.exit("Error with hello-openfaas, expected: %d, got: %d\n" % (200, r.status_code))
    # http://docs.python-requests.org/en/master/
    return r.text
---------------------------------------------------------------------------------------
faas-cli build -f ./base64hello.yml
faas-cli deploy -f ./base64hello.yml
echo | faas-cli invoke base64hello
Apache Spark (You Try It!)
Setup
# login to 4 lab systems (not the VMs)
scala # to check that scala is installed
mkdir -p /virtual/$USER/csc409/spark
cd /virtual/$USER/csc409/spark
scp $USER@dh2026pc01.utm.utoronto.ca:/virtual/csc409/spark/spark-2.3.2-bin-hadoop2.7.tgz .
tar -zxf spark-2.3.2-bin-hadoop2.7.tgz
cd spark-2.3.2-bin-hadoop2.7
# FOR MASTER NODE -------------------------------------------------
# Pick one node as the master and find its IP:
ifconfig # THIS IS THE MASTER IP
cp conf/spark-env.sh.template conf/spark-env.sh
# add lines like ...
# SPARK_MASTER_HOST="142.1.46.4" # get this from ifconfig
# SPARK_MASTER_PORT="7077"
# start the master
sbin/start-master.sh
# connect a spark-shell
bin/spark-shell --master spark://142.1.46.4:7077
# visit http://localhost:8080 (Spark master web UI)
# visit http://localhost:4040 (application UI, available while the spark-shell is connected)
# FOR SLAVE NODES (can run a slave on the master as well) ---
sbin/start-slave.sh spark://142.1.46.4:7077
Now run through the examples in last week's lecture.
Extract!!
val textFile = sc.textFile("README.md")
// let's see which executor processes each line of the RDD...
import org.apache.spark._
val rdd = textFile.map(line => (SparkEnv.get.executorId,line))
rdd.filter(x=>x._1=="1").collect()
rdd.filter(x=>x._1=="1").count()
rdd.countByKey()
textFile.count() // Number of items in this Dataset
textFile.first() // First item in this Dataset
textFile.take(100)
val linesWithSpark = textFile.filter(line => line.contains("Spark"))
// val means the binding is not modifiable (immutable)
textFile.filter(line => line.contains("Spark")).count() // How many lines contain "Spark"?
textFile.map(line => line.split(" ").size).reduce((a, b) => if (a > b) a else b)
// access to Java API
import java.lang.Math
textFile.map(line => line.split(" ").size).reduce((a, b) => Math.max(a, b))
val words = textFile.flatMap(line => line.split(" ")).groupBy(identity)
// pull all parts of the RDD back into the driver (running on the master)
words.collect()
words.count()
// words.reduce((a,b)=>a+b) // can't do that: the elements are (word, group) pairs, not numbers
words.map((x)=>1)
words.map((x)=>1).reduce((a,b)=>a+b) // number of distinct words
linesWithSpark.cache() // keep this RDD in memory once it has been computed
linesWithSpark.count() // first count computes the RDD and caches it
linesWithSpark.count() // second count is served from the cache
RDD
The main abstraction Spark provides is a resilient distributed dataset (RDD), a collection of elements partitioned across the nodes of the cluster that can be operated on in parallel. Operations are either transformations (lazy: they define a new RDD, e.g. map, filter) or actions (they trigger computation and return a result to the driver, e.g. count, collect).
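A minimal sketch of the distinction, assuming the spark-shell session started above (so sc is already defined):
val nums = sc.parallelize(1 to 10) // create an RDD; nothing is computed yet
val doubled = nums.map(_ * 2) // transformation: just recorded, still lazy
doubled.reduce(_ + _) // action: triggers the computation, returns 110
doubled.collect() // action: pulls all elements back to the driver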
import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
val data = Array(1, 2, 3, 4, 5)
val distData = sc.parallelize(data)
val distFile = sc.textFile("data.txt")
val lines = sc.textFile("data.txt")
val lineLengths = lines.map(s => s.length)
val totalLength = lineLengths.reduce((a, b) => a + b)
lineLengths.persist() // keep lineLengths in memory so later actions can reuse it
val lines = sc.textFile("data.txt")
val pairs = lines.map(s => (s, 1))
val counts = pairs.reduceByKey((a, b) => a + b)
counts.collect()
Spark SQL, Datasets and DataFrames
- A Dataset is a distributed collection of data with a strongly-typed API.
- A DataFrame is a Dataset organized into named columns, conceptually a table in a relational database.
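As a minimal sketch of the typed side, assuming a spark-shell session (so spark is already defined), a Dataset can be built from a case class:
case class Person(name: String, age: Long)
import spark.implicits._ // enables .toDS() and the $"col" syntax
val people = Seq(Person("Ann", 34), Person("Bob", 19)).toDS()
people.filter(_.age > 21).show() // typed: the filter compiles against Person's fields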
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
// For implicit conversions like converting RDDs to DataFrames
import spark.implicits._
val df = spark.read.json("examples/src/main/resources/people.json")
df.show()
df.printSchema()
df.select("name").show()
df.select($"name", $"age" + 1).show()
df.filter($"age" > 21).show()
df.groupBy("age").count().show()
df.createOrReplaceTempView("people")
val sqlDF = spark.sql("SELECT * FROM people")
sqlDF.show()
Playing With Genomic Data
val genome = sc.textFile("/virtual/csc409/sequence2")
val c = genome.map(_=>1)
c.reduce(_+_) // number of lines in the file
val d = genome.map((_,1))
val d = genome.map(x=>x.split(" ")(3)) // redefine d: take the 4th space-separated field of each line
import org.apache.spark._
val rdd = d.map(line => (SparkEnv.get.executorId,line))
rdd.countByKey()
val e = d.map(x=>(x,1))
e.take(100)
e.reduceByKey((x,y)=>x+y)
val f = e.reduceByKey((x,y)=>x+y)
f.take(10)
f.map(line => (SparkEnv.get.executorId,line)).countByKey() // see how the data is split
f.collect()
Using DataFrames, Datasets and Spark SQL
import org.apache.spark.sql.SparkSession
val spark = SparkSession.builder().appName("Spark SQL basic example").config("spark.some.config.option", "some-value").getOrCreate()
import spark.implicits._
// first attempt: treat the first row as a header (this file has no header row, so use header=false below)
// val genomeds = spark.read.format("csv").option("sep", " ").option("inferSchema", "true").option("header", "true").load("/virtual/csc409/sequence2")
// genomeds.printSchema()
val genomeds = spark.read.format("csv").option("sep", " ").option("inferSchema", "true").option("header", "false").load("/virtual/csc409/sequence2")
genomeds.printSchema()
genomeds.groupBy("_c3").count().show()
val result = genomeds.groupBy("_c3").count()
result.createOrReplaceTempView("v")
spark.sql("select * from v").show()
genomeds.createOrReplaceTempView("genomeds")
spark.sql("select _c3, count(*) from genomeds group by _c3").show()
References