Generate markup file from html file source.
Generate markup file from html file source.
Args are : –url: give online page url , –source: offline html file e.g. index.html
import os
import sys, argparse
import requests
from bs4 import BeautifulSoup, Tag
def generate_md_file_from_html(url=None, source=None):
out_file_name = 'tmp_out.md'
last_file = 'out.md'
os.system (f'rm -rf {out_file_name}')
os.system (f'rm -rf {last_file}')
tmp_soup = ''
if url is not None:
r = requests.get (url)
tmp_soup = r.text
tmp_soup = BeautifulSoup (tmp_soup, 'html.parser')
elif source is not None:
with open (source) as fp:
tmp_soup = BeautifulSoup (fp, 'html.parser')
else:
raise 'Error getting source html'
if tmp_soup == '':
raise 'Error getting source html'
def walker(soup):
if soup.name is not None:
for child in soup.children:
# process node
# print( str (child.name) + ":" + str (type (child)))
if child.name in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ol', 'pre']: # Add tags to collect inner text from html source.
chld = str (child)
print (chld.strip (), file=open (out_file_name, 'a+')) # print valid tags
walker (child)
walker (soup=tmp_soup)
with open (out_file_name) as f: # stripping lines of the file
for lin in f:
print (lin.strip (), file=open (last_file, 'a+'))
print (f'Check file {last_file} for final md file.')
if __name__ == '__main__':
'''
--url: give online page url
--source: offline html file e.g. index.html
'''
argument_parser = argparse.ArgumentParser ()
argument_parser.add_argument ("--url", required=False)
argument_parser.add_argument ("--source", required=False)
args = argument_parser.parse_args ()
if args.url:
generate_md_file_from_html (url=args.url)
elif args.source:
generate_md_file_from_html (source=args.source)
else:
raise f'Error getting source html'