Generate markup file from html file source.
Generate markup file from html file source.
Arguments: --url: URL of an online page; --source: path to an offline HTML file, e.g. index.html
import os
import sys
import argparse

import requests
from bs4 import BeautifulSoup, Tag

# Tags whose full markup is copied to the output file.
_COLLECTED_TAGS = ('p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li', 'ol', 'pre')


def _remove_if_present(path):
    """Delete *path*, doing nothing when it does not exist."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _load_soup(url=None, source=None):
    """Parse the HTML at *url* (preferred) or in the local file *source*."""
    if url is not None:
        response = requests.get(url)
        # Fail loudly on 4xx/5xx instead of converting the error page.
        response.raise_for_status()
        return BeautifulSoup(response.text, 'html.parser')
    if source is not None:
        with open(source) as fp:
            return BeautifulSoup(fp, 'html.parser')
    raise ValueError('Error getting source html')


def _iter_collected_markup(node):
    """Yield the stripped markup of every collected tag below *node*.

    Tags are visited depth-first in document order. A collected tag nested
    inside another collected tag (e.g. ``li`` inside ``ol``) is yielded twice:
    once as part of its ancestor's markup and once on its own.
    """
    if node.name is None:
        # Nodes without a tag name have no children to descend into.
        return
    for child in node.children:
        if child.name in _COLLECTED_TAGS:
            yield str(child).strip()
        yield from _iter_collected_markup(child)


def generate_md_file_from_html(url=None, source=None):
    """Extract selected tags from an HTML page into ``out.md``.

    The markup of every ``p``, ``h1``-``h6``, ``li``, ``ol`` and ``pre`` tag is
    written to the intermediate file ``tmp_out.md``; that file is then copied
    to ``out.md`` with leading/trailing whitespace removed from each line.
    Both files live in the current working directory and are replaced on
    every call.

    Args:
        url: Address of an online page to download. Takes precedence over
            *source* when both are given.
        source: Path of a local HTML file, e.g. ``index.html``.

    Raises:
        ValueError: If neither *url* nor *source* is given.
        requests.HTTPError: If the server answers with an error status.
        OSError: If *source* cannot be opened or the output files cannot
            be written.
    """
    out_file_name = 'tmp_out.md'
    last_file = 'out.md'

    # Drop stale results first so a failed run never leaves old output behind.
    _remove_if_present(out_file_name)
    _remove_if_present(last_file)

    soup = _load_soup(url=url, source=source)

    # One handle per file: the files are closed deterministically and are
    # created even when the page contains none of the collected tags.
    with open(out_file_name, 'w') as tmp_out:
        for markup in _iter_collected_markup(soup):
            print(markup, file=tmp_out)

    # Second pass: strip every individual line of the collected markup.
    with open(out_file_name) as tmp_in, open(last_file, 'w') as final_out:
        for line in tmp_in:
            print(line.strip(), file=final_out)

    print(f'Check file {last_file} for final md file.')


def main():
    """Parse the command line and run the conversion.

    Options:
        --url: give online page url
        --source: offline html file e.g. index.html
    """
    argument_parser = argparse.ArgumentParser()
    argument_parser.add_argument("--url", required=False)
    argument_parser.add_argument("--source", required=False)
    args = argument_parser.parse_args()

    if args.url:
        generate_md_file_from_html(url=args.url)
    elif args.source:
        generate_md_file_from_html(source=args.source)
    else:
        # Prints usage plus the message and exits with status 2.
        argument_parser.error('Error getting source html')


if __name__ == '__main__':
    main()